forked from mindspore-Ecosystem/mindspore

add mem manager

commit fb343bd607 (parent dd9a5a385a)

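This commit moves device-memory management out of the KernelRuntime subclasses into a new MemoryManager hierarchy: AscendMemoryManager and GPUMemoryManager (both added below) take over MallocDeviceMemory/FreeDeviceMemory and the tensor-pool calls, and the runtimes route every allocation through a shared mem_manager_ pointer. The base class lives in device/memory_manager.h (registered in CMake below but not shown in this extract); a hypothetical sketch of the interface implied by the overrides and mem_manager_-> call sites in the hunks that follow — every name here is inferred, not confirmed — might look like:

// Hypothetical reconstruction of device/memory_manager.h -- NOT part of this
// diff. Each declaration below is inferred from an override or call site in
// the hunks that follow; treat it as a sketch, not the actual header.
#include <cstddef>
#include <cstdint>

namespace mindspore {
namespace device {
class MemoryManager {
 public:
  virtual ~MemoryManager() = default;

  virtual void MallocDeviceMemory() = 0;             // Ascend: rtMalloc regions; GPU: allocator block
  virtual void FreeDeviceMemory() = 0;
  virtual void *AllocTensorMemDynamic(size_t size);  // pool-backed tensor memory
  virtual void FreeTensorMemDynamic(void *device_ptr);

  void ResetDynamicMemory();                         // called from KernelRuntime::AssignMemory
  uint8_t *MallocMem(int flag, size_t size);         // kStaticMem / kDynamicMem
  size_t GetCommonAlignSize(size_t input_size) const;
  // Also inferred from call sites, but omitted here because they need graph
  // types (AnfNodePtr, DeviceAddressPtr, session::KernelGraph):
  //   void MallocOpMemory(const DeviceAddressPtr address, size_t size);
  //   uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size);
  //   void InitReuseDynamicMemory(session::KernelGraph *graph);

 protected:
  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem);

  // Shared state used by both subclasses (names taken from the code below).
  uint8_t *device_mem_base_{nullptr};
  uint8_t *device_mem_pool_base_{nullptr};
  uint64_t device_mem_size_{0};
  uint64_t device_mem_pool_size_{0};
  size_t static_mem_offset_{0};
  size_t dynamic_mem_offset_{0};
  size_t total_static_size_{0};
  size_t total_dynamic_size_{0};
};
}  // namespace device
}  // namespace mindspore
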
@@ -132,6 +132,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
     "kernel/kash/*.cc"
     "device/kernel_info.cc"
     "device/kernel_runtime.cc"
+    "device/memory_manager.cc"
     "device/kernel_runtime_manager.cc"
     "device/convert_tensor_utils.cc"
     "pre_activate/common/*.cc"

@@ -37,6 +37,7 @@
 #include "kernel/tbe/tbe_utils.h"
 #include "kernel/tbe/tbe_python_funcs.h"
 #include "pre_activate/mem_reuse/mem_reuse_checker.h"
+#include "device/ascend/ascend_memory_manager.h"

 using mindspore::device::ascend::ProfilingManager;
 using mindspore::device::ascend::ProfilingUtils;

@@ -47,8 +48,6 @@ using std::vector;
 namespace mindspore {
 namespace device {
 namespace ascend {
-static const uint64_t ASCEND_MEM_SIZE = 20;
-static const uint64_t ASCEND_MEM_SIZE_BYTE = (ASCEND_MEM_SIZE << 30);
 static const size_t PRAMATER_OUTPUT_INDEX = 0;

 AscendKernelRuntime::~AscendKernelRuntime() { graph_model_map_.clear(); }

@@ -86,7 +85,8 @@ void AscendKernelRuntime::ReleaseDeviceRes() {
     MS_EXCEPTION(DeviceProcessError) << "rtSetDevice, ret[" << static_cast<int>(ret) << "]";
   }

-  FreeDeviceMemory();
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  mem_manager_->FreeDeviceMemory();
   (void)DestroyHccl();
   (void)ResetDevice();
   (void)ProfilingManager::GetInstance().StopProfiling();

@@ -109,11 +109,9 @@ bool AscendKernelRuntime::Init() {
   if (!ret) {
     return ret;
   }
-  ret = MallocDeviceMemory();
-  if (!ret) {
-    return ret;
-  }
+  mem_manager_ = std::make_shared<AscendMemoryManager>();
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  mem_manager_->MallocDeviceMemory();

   ret = ProfilingManager::GetInstance().StartupProfiling(device_id_);
   if (!ret) {

@@ -239,13 +237,6 @@ DeviceAddressPtr AscendKernelRuntime::CreateDeviceAddress(void *device_ptr, size
   return std::make_shared<AscendDeviceAddress>(device_ptr, device_size, format, type_id);
 }

-void AscendKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) {
-  auto device_ptr = AscendMemoryAllocator::GetInstance().AllocTensorMem(size);
-  MS_EXCEPTION_IF_NULL(device_ptr);
-  address->ptr_ = device_ptr;
-  address->mem_dynamic_alloc_ = true;
-}
-
 bool AscendKernelRuntime::GenTask(const session::KernelGraph *graph) {
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);

@@ -474,42 +465,6 @@ bool AscendKernelRuntime::DestroyHccl() {
   context_ptr->set_enable_hccl(false);
   return true;
 }
-
-bool AscendKernelRuntime::MallocDeviceMemory() {
-  device_mem_size_ = ASCEND_MEM_SIZE_BYTE;
-  static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO);
-  auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM);
-  if (ret != RT_ERROR_NONE) {
-    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]";
-  }
-  device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO));
-  ret = rtMalloc(reinterpret_cast<void **>(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM);
-  if (ret != RT_ERROR_NONE) {
-    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
-  }
-  AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_);
-  AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_);
-  return true;
-}
-
-void AscendKernelRuntime::FreeDeviceMemory() {
-  if (device_mem_base_ != nullptr) {
-    auto ret = rtFree(device_mem_base_);
-    if (ret != RT_ERROR_NONE) {
-      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]";
-    }
-    device_mem_base_ = nullptr;
-  }
-  if (device_mem_pool_base_ != nullptr) {
-    auto ret = rtFree(device_mem_pool_base_);
-    if (ret != RT_ERROR_NONE) {
-      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
-    }
-    device_mem_pool_base_ = nullptr;
-  }
-}
-
-void AscendKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; }
 }  // namespace ascend
 }  // namespace device
 }  // namespace mindspore

@@ -39,13 +39,11 @@ class AscendKernelRuntime : public KernelRuntime {
   bool GenTask(const session::KernelGraph *graph) override;
   bool RunTask(const session::KernelGraph *graph) override;
   bool LoadTask(const session::KernelGraph *graph) override;
-  void FreeHostMemory() override;

  protected:
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
   bool SyncStream() override;
-  void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override;

  private:
   bool InitDevice();

@@ -53,8 +51,7 @@ class AscendKernelRuntime : public KernelRuntime {
   bool HcclInit();
   bool NeedDestroyHccl();
   bool DestroyHccl();
-  bool MallocDeviceMemory();
-  void FreeDeviceMemory();
   void ClearGraphModelMap();
   void ReleaseDeviceRes() override;
   uint32_t GetGraphModelId(const session::KernelGraph *kernel_graph);

@@ -0,0 +1,65 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/ascend/ascend_memory_manager.h"
+#include "device/ascend/ascend_memory_allocator.h"
+#include "utils/context/ms_context.h"
+#include "runtime/mem.h"
+namespace mindspore {
+namespace device {
+namespace ascend {
+static const uint64_t ASCEND_MEM_SIZE = 20;
+static const uint64_t ASCEND_MEM_SIZE_BYTE = (ASCEND_MEM_SIZE << 30);
+
+void AscendMemoryManager::MallocDeviceMemory() {
+  device_mem_size_ = ASCEND_MEM_SIZE_BYTE;
+  static_mem_offset_ = FloatToSize(device_mem_size_ * GRAPH_INIT_ASCEND_MEM_RATIO);
+  auto ret = rtMalloc(reinterpret_cast<void **>(&device_mem_base_), static_mem_offset_, RT_MEMORY_HBM);
+  if (ret != RT_ERROR_NONE) {
+    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << static_mem_offset_ << "] fail, ret[" << ret << "]";
+  }
+  device_mem_pool_size_ = FloatToSize(device_mem_size_ * (1 - GRAPH_INIT_ASCEND_MEM_RATIO));
+  ret = rtMalloc(reinterpret_cast<void **>(&device_mem_pool_base_), device_mem_pool_size_, RT_MEMORY_HBM);
+  if (ret != RT_ERROR_NONE) {
+    MS_EXCEPTION(DeviceProcessError) << "rtMalloc mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
+  }
+  AscendMemoryAllocator::GetInstance().set_device_mem_pool_base(device_mem_pool_base_);
+  AscendMemoryAllocator::GetInstance().set_device_mem_pool_size(device_mem_pool_size_);
+}
+
+void AscendMemoryManager::FreeDeviceMemory() {
+  if (device_mem_base_ != nullptr) {
+    auto ret = rtFree(device_mem_base_);
+    if (ret != RT_ERROR_NONE) {
+      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_size_ << "] fail, ret[" << ret << "]";
+    }
+    device_mem_base_ = nullptr;
+  }
+  if (device_mem_pool_base_ != nullptr) {
+    auto ret = rtFree(device_mem_pool_base_);
+    if (ret != RT_ERROR_NONE) {
+      MS_LOG(ERROR) << "rtFree mem size[" << device_mem_pool_size_ << "] fail, ret[" << ret << "]";
+    }
+    device_mem_pool_base_ = nullptr;
+  }
+}
+
+void *AscendMemoryManager::AllocTensorMemDynamic(size_t size) {
+  return AscendMemoryAllocator::GetInstance().AllocTensorMem(size);
+}
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore

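For scale: ASCEND_MEM_SIZE_BYTE is 20 << 30 = 21,474,836,480 bytes (20 GiB), carved once at startup into a statically planned region and a dynamic pool. A small stand-alone illustration of the split — GRAPH_INIT_ASCEND_MEM_RATIO is defined elsewhere in the tree, so the 0.5 below is a placeholder, not the real value:

// Stand-alone illustration of the split in AscendMemoryManager::MallocDeviceMemory.
// kGraphInitRatio stands in for GRAPH_INIT_ASCEND_MEM_RATIO, whose real value
// is not shown in this diff.
#include <cstdint>
#include <cstdio>

int main() {
  const uint64_t kAscendMemSizeByte = 20ULL << 30;  // 21474836480 bytes = 20 GiB
  const float kGraphInitRatio = 0.5f;               // placeholder ratio
  auto static_region = static_cast<uint64_t>(kAscendMemSizeByte * kGraphInitRatio);
  auto pool_region = static_cast<uint64_t>(kAscendMemSizeByte * (1 - kGraphInitRatio));
  // Two rtMalloc(RT_MEMORY_HBM) calls back these regions: the first holds
  // statically planned tensors, the second seeds AscendMemoryAllocator's pool.
  std::printf("static: %llu bytes, pool: %llu bytes\n",
              static_cast<unsigned long long>(static_region),
              static_cast<unsigned long long>(pool_region));
  return 0;
}
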
@@ -0,0 +1,35 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_
+#include "device/memory_manager.h"
+namespace mindspore {
+namespace device {
+namespace ascend {
+class AscendMemoryManager : public MemoryManager {
+ public:
+  AscendMemoryManager() = default;
+  virtual ~AscendMemoryManager() = default;
+
+  void MallocDeviceMemory() override;
+  void FreeDeviceMemory() override;
+  void *AllocTensorMemDynamic(size_t size) override;
+};
+}  // namespace ascend
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEVICE_ASCEND_ASCEND_MEMORY_MANAGER_H_

@@ -33,12 +33,14 @@ class CPUKernelRuntime;
 }  // namespace cpu
 namespace ascend {
 class AscendKernelRuntime;
+class AscendMemoryManager;
 namespace tasksink {
 class TaskGenerator;
 }  // namespace tasksink
 }  // namespace ascend
 namespace gpu {
 class GPUKernelRuntime;
+class GPUMemoryManager;
 }  // namespace gpu
 }  // namespace device
 }  // namespace mindspore

@@ -70,12 +72,15 @@ class DeviceAddress {
   TypeId type_id_{kNumberTypeFloat16};
   bool mem_dynamic_alloc_{false};
   friend class KernelRuntime;
+  friend class MemoryManager;
   friend class mindspore::device::ascend::tasksink::TaskGenerator;
   friend class mindspore::device::cpu::CPUSimpleMemPlan;
   friend class mindspore::device::cpu::CPUResourceManager;
   friend class mindspore::device::cpu::CPUKernelRuntime;
   friend class mindspore::device::gpu::GPUKernelRuntime;
+  friend class mindspore::device::gpu::GPUMemoryManager;
   friend class mindspore::device::ascend::AscendKernelRuntime;
+  friend class mindspore::device::ascend::AscendMemoryManager;
 };

 using DeviceAddressPtr = std::shared_ptr<DeviceAddress>;

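The new friend declarations exist because DeviceAddress keeps ptr_ and mem_dynamic_alloc_ private; the managers write them directly (the removed per-runtime MallocOpMemory bodies above assign address->ptr_, and that logic moves into MemoryManager). A reduced illustration of the access pattern, not taken from the real header:

// Reduced model: only the part of DeviceAddress relevant to the new friends.
class MemoryManager;

class DeviceAddressModel {
  void *ptr_{nullptr};             // private: only friends may assign it
  bool mem_dynamic_alloc_{false};  // marks pool-backed allocations for cleanup
  friend class MemoryManager;      // added by this commit
};
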
@@ -26,6 +26,7 @@
 #include "device/kernel_runtime_manager.h"
 #include "device/gpu/gpu_common.h"
 #include "common/utils.h"
+#include "device/gpu/gpu_memory_manager.h"

 namespace mindspore {
 namespace device {

@@ -36,26 +37,14 @@ bool GPUKernelRuntime::Init() {
   if (device_init_ == true) {
     return true;
   }

   auto ret = InitDevice();
   if (!ret) {
     MS_LOG(ERROR) << "InitDevice error.";
     return ret;
   }
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  // If use the dynamic memory pool, then alloc the first memory block to init.
-  if (context_ptr->enable_dynamic_mem_pool()) {
-    auto device_addr = AllocTensorMemDynamic(1);
-    if (!device_addr) {
-      MS_LOG(ERROR) << "Dynamic memory pool init error.";
-      return false;
-    }
-  } else {
-    MallocDeviceMemory();
-  }
-
+  mem_manager_ = std::make_shared<GPUMemoryManager>();
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  mem_manager_->MallocDeviceMemory();
   const void *collective_handle_ = CollectiveInitializer::instance().collective_handle();
   bool collective_inited = CollectiveInitializer::instance().collective_inited();
   if (collective_inited && collective_handle_ != nullptr) {

@@ -101,16 +90,6 @@ bool GPUKernelRuntime::InitDevice() {
   return true;
 }

-void GPUKernelRuntime::MallocDeviceMemory() {
-  // Need to reserve 20% space for dynamic memory
-  const float init_gpu_mem_ratio = 0.8;
-  size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio);
-  auto alloc_size =
-    GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast<void **>(&device_mem_base_));
-  device_mem_size_ = alloc_size;
-  static_mem_offset_ = device_mem_size_;
-}
-
 void GPUKernelRuntime::ReleaseDeviceRes() {
   // For dataset mode.
   if (GpuBufferMgr::GetInstance().IsInit()) {

@@ -122,39 +101,22 @@ void GPUKernelRuntime::ReleaseDeviceRes() {
     CHECK_OP_RET_WITH_EXCEPT(GpuBufferMgr::GetInstance().Destroy(), "Could not destroy gpu data queue.");
   }
   GPUDeviceManager::GetInstance().ReleaseDevice();
-  if (device_mem_base_ != nullptr) {
-    if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) {
-      MS_LOG(EXCEPTION) << "Could not free gpu device memory.";
-    }
-  }
-  GPUMemoryAllocator::GetInstance().ReleaseDeviceRes();
-}
-
-void GPUKernelRuntime::FreeHostMemory() { dynamic_mem_offset_ = 0; }
-
-void *GPUKernelRuntime::AllocTensorMemDynamic(size_t size) {
-  return GPUMemoryAllocator::GetInstance().AllocTensorMem(size);
-}
-
-void GPUKernelRuntime::FreeTensorMemDynamic(void *device_ptr) {
-  GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  mem_manager_->FreeDeviceMemory();
 }

 void GPUKernelRuntime::AssignMemory(session::KernelGraph *graph) {
   auto context_ptr = MsContext::GetInstance();
   MS_EXCEPTION_IF_NULL(context_ptr);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  mem_manager_->ResetDynamicMemory();
   AssignStaticMemory(graph);
-  bool is_enable_mem_reuse = context_ptr->enable_mem_reuse();
   bool is_enable_dynamic_mem = context_ptr->enable_dynamic_mem_pool();
   if (is_enable_dynamic_mem) {
     // Use the dynamic memory pool.
     InitKernelRefCount(graph);
     InitKernelOutputAddress(graph);
-  } else if (is_enable_mem_reuse) {
-    // Use the memory reuse.
-    ReuseAssignDynamicMemory(graph);
   } else {
-    // Normal way.
     AssignDynamicMemory(graph);
   }
 }

@@ -179,32 +141,6 @@ bool GPUKernelRuntime::Run(session::KernelGraph *graph) {
   return ret;
 }

-uint8_t *GPUKernelRuntime::MallocStaticMem(size_t size, bool) {
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  if (context_ptr->enable_dynamic_mem_pool()) {
-    auto device_ptr = AllocTensorMemDynamic(size);
-    MS_EXCEPTION_IF_NULL(device_ptr);
-    return AddressOffset(device_ptr, 0);
-  }
-
-  auto align_size = GetCommonAlignSize(size);
-  if (static_mem_offset_ < align_size) {
-    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
-                      << "] static[" << total_static_size_ << "])"
-                      << " malloc [" << align_size << "] failed!";
-  }
-  auto offset = static_mem_offset_ - align_size;
-  if (dynamic_mem_offset_ > offset) {
-    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
-                      << "] static[" << total_static_size_ << "])"
-                      << " malloc [" << align_size << "] failed!";
-  }
-  total_static_size_ += align_size;
-  static_mem_offset_ = offset;
-  return device_mem_base_ + offset;
-}
-
 void GPUKernelRuntime::InitKernelRefCount(const session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
   MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();

@@ -273,6 +209,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
   MS_EXCEPTION_IF_NULL(kernel_inputs);
   MS_EXCEPTION_IF_NULL(kernel_workspaces);
   MS_EXCEPTION_IF_NULL(kernel_outputs);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
   for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(kernel); ++i) {
     auto device_address = AnfAlgo::GetPrevNodeOutputAddr(kernel, i);
     MS_EXCEPTION_IF_NULL(device_address);

@@ -290,7 +227,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
     MS_EXCEPTION_IF_NULL(device_address);
     auto device_ptr = device_address->ptr_;
     if (device_ptr == nullptr) {
-      device_ptr = AllocTensorMemDynamic(output_sizes[i]);
+      device_ptr = mem_manager_->AllocTensorMemDynamic(output_sizes[i]);
       MS_EXCEPTION_IF_NULL(device_ptr);
       device_address->ptr_ = device_ptr;
     }

@@ -307,7 +244,7 @@ void GPUKernelRuntime::AllocKernelDynamicRes(const mindspore::kernel::KernelMod
       kernel_workspaces->emplace_back(nullptr);
       continue;
     }
-    auto device_ptr = AllocTensorMemDynamic(workspace_sizes[i]);
+    auto device_ptr = mem_manager_->AllocTensorMemDynamic(workspace_sizes[i]);
     MS_EXCEPTION_IF_NULL(device_ptr);
     kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
     MS_EXCEPTION_IF_NULL(workspace);

@@ -333,6 +270,7 @@ void GPUKernelRuntime::AllocCommunicationOpDynamicRes(const session::KernelGraph

 void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfNodePtr &kernel) {
   MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
   // The reference count of communication kernel input is not 0.
   if (communication_op_input_ref_count_ != 0) {
     MS_LOG(ERROR) << "The reference count of communication kernel input is not 0.";

@@ -354,7 +292,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN
     addr_size.emplace_back(device_address.get(), output_size);
   }

-  auto device_mem_ptr = AllocTensorMemDynamic(total);
+  auto device_mem_ptr = mem_manager_->AllocTensorMemDynamic(total);
   MS_EXCEPTION_IF_NULL(device_mem_ptr);
   for (const auto &iter : addr_size) {
     MS_EXCEPTION_IF_NULL(iter.first);

@@ -366,6 +304,7 @@ void GPUKernelRuntime::AllocCommunicationOpInputDynamicRes(const mindspore::AnfN

 void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::AnfNodePtr &kernel) {
   MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
   // The reference count of communication kernel output is not 0.
   if (communication_op_output_ref_count_ != 0) {
     MS_LOG(ERROR) << "The reference count of communication kernel output is not 0.";

@@ -389,7 +328,7 @@ void GPUKernelRuntime::AllocCommunicationOpOutputDynamicRes(const mindspore::Anf
     addr_size.emplace_back(device_address.get(), output_sizes[i]);
   }

-  auto device_mem_ptr = AllocTensorMemDynamic(total);
+  auto device_mem_ptr = mem_manager_->AllocTensorMemDynamic(total);
   MS_EXCEPTION_IF_NULL(device_mem_ptr);
   for (const auto &iter : addr_size) {
     MS_EXCEPTION_IF_NULL(iter.first);

@@ -402,6 +341,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
                                             const AddressPtrList &kernel_workspaces) {
   MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
   auto cnode = kernel->cast<CNodePtr>();
   MS_EXCEPTION_IF_NULL(cnode);
   // Free the input of kernel by reference count.

@@ -421,7 +361,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
       auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, i);
       MS_EXCEPTION_IF_NULL(device_address);
       MS_EXCEPTION_IF_NULL(device_address->ptr_);
-      FreeTensorMemDynamic(device_address->ptr_);
+      mem_manager_->FreeTensorMemDynamic(device_address->ptr_);
       device_address->ptr_ = nullptr;
     }
   }

@@ -432,7 +372,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
     auto workspace = kernel_workspaces[i];
     if (workspace != nullptr) {
       MS_EXCEPTION_IF_NULL(workspace->addr);
-      FreeTensorMemDynamic(workspace->addr);
+      mem_manager_->FreeTensorMemDynamic(workspace->addr);
       workspace->addr = nullptr;
     }
   }

@@ -441,6 +381,7 @@ void GPUKernelRuntime::FreeKernelDynamicRes(const mindspore::AnfNodePtr &kernel,
 void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx,
                                                      bool *is_communication_op) {
   MS_EXCEPTION_IF_NULL(kernel);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
   // The inputs memory of communication kernel is one piece memory, need release together.
   if (AnfAlgo::GetCNodeName(kernel) == kAllReduceOpName) {
     communication_op_input_ref_count_--;

@@ -448,7 +389,7 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr
       auto device_address = AnfAlgo::GetPrevNodeMutableOutputAddr(kernel, 0);
       MS_EXCEPTION_IF_NULL(device_address);
       MS_EXCEPTION_IF_NULL(device_address->ptr_);
-      FreeTensorMemDynamic(device_address->ptr_);
+      mem_manager_->FreeTensorMemDynamic(device_address->ptr_);
       device_address->ptr_ = nullptr;
     }
     *is_communication_op = true;

@@ -470,19 +411,12 @@ void GPUKernelRuntime::FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr
       auto device_address = AnfAlgo::GetMutableOutputAddr(kernel_input.first, 0);
       MS_EXCEPTION_IF_NULL(device_address);
       MS_EXCEPTION_IF_NULL(device_address->ptr_);
-      FreeTensorMemDynamic(device_address->ptr_);
+      mem_manager_->FreeTensorMemDynamic(device_address->ptr_);
       device_address->ptr_ = nullptr;
     }
     *is_communication_op = true;
   }
 }
-
-void GPUKernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int) {
-  auto device_ptr = AllocTensorMemDynamic(size);
-  MS_EXCEPTION_IF_NULL(device_ptr);
-  address->ptr_ = device_ptr;
-  address->mem_dynamic_alloc_ = true;
-}
 }  // namespace gpu
 }  // namespace device
 }  // namespace mindspore

@@ -33,7 +33,6 @@ class GPUKernelRuntime : public KernelRuntime {
   ~GPUKernelRuntime() override = default;
   bool Init() override;
   void ReleaseDeviceRes() override;
-  void FreeHostMemory() override;
   void AssignMemory(session::KernelGraph *graph) override;
   bool Run(session::KernelGraph *graph) override;

@@ -41,18 +40,11 @@ class GPUKernelRuntime : public KernelRuntime {
   DeviceAddressPtr CreateDeviceAddress(void *device_ptr, size_t device_size, const string &format,
                                        TypeId type_id) override;
   bool SyncStream() override;
-  // Alloc memory use the dynamic memory pool.
-  void *AllocTensorMemDynamic(size_t size) override;
-  // Free memory use the dynamic memory pool.
-  void FreeTensorMemDynamic(void *device_ptr) override;
-  void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) override;
-  uint8_t *MallocStaticMem(size_t size, bool communication_mem) override;

  private:
   GPUKernelRuntime(const GPUKernelRuntime &);
   GPUKernelRuntime &operator=(const GPUKernelRuntime &);
   bool InitDevice();
-  void MallocDeviceMemory();
   bool device_init_{false};

   // The related functions and members for using dynamic memory pool.

@@ -69,6 +61,7 @@ class GPUKernelRuntime : public KernelRuntime {
   void FreeCommunicationOpDynamicRes(const mindspore::AnfNodePtr &kernel, size_t input_idx, bool *is_communication_op);
   size_t communication_op_input_ref_count_{0};
   size_t communication_op_output_ref_count_{0};
+  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
 };
 MS_REG_KERNEL_RUNTIME(kGPUDevice, GPUKernelRuntime);
 }  // namespace gpu

@@ -0,0 +1,88 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "device/gpu/gpu_memory_manager.h"
+#include "device/gpu/gpu_memory_allocator.h"
+#include "utils/context/ms_context.h"
+#include "utils/convert_utils.h"
+namespace mindspore {
+namespace device {
+namespace gpu {
+void *GPUMemoryManager::AllocTensorMemDynamic(size_t size) {
+  return GPUMemoryAllocator::GetInstance().AllocTensorMem(size);
+}
+
+void GPUMemoryManager::FreeTensorMemDynamic(void *device_ptr) {
+  GPUMemoryAllocator::GetInstance().FreeTensorMem(device_ptr);
+}
+
+void GPUMemoryManager::MallocDeviceMemory() {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  // If use the dynamic memory pool, then alloc the first memory block to init.
+  if (context_ptr->enable_dynamic_mem_pool()) {
+    auto device_addr = AllocTensorMemDynamic(1);
+    if (!device_addr) {
+      MS_LOG(ERROR) << "Dynamic memory pool init error.";
+    }
+  } else {
+    // Need to reserve 20% space for dynamic memory
+    const float init_gpu_mem_ratio = 0.8;
+    size_t mem_size = FloatToSize(GPUMemoryAllocator::GetInstance().free_mem_size() * init_gpu_mem_ratio);
+    auto alloc_size =
+      GPUMemoryAllocator::GetInstance().AllocDeviceMem(mem_size, reinterpret_cast<void **>(&device_mem_base_));
+    device_mem_size_ = alloc_size;
+    static_mem_offset_ = device_mem_size_;
+  }
+}
+
+void GPUMemoryManager::FreeDeviceMemory() {
+  if (device_mem_base_ != nullptr) {
+    if (!GPUMemoryAllocator::GetInstance().FreeDeviceMem(device_mem_base_)) {
+      MS_LOG(EXCEPTION) << "Could not free gpu device memory.";
+    }
+  }
+  GPUMemoryAllocator::GetInstance().ReleaseDeviceRes();
+}
+
+uint8_t *GPUMemoryManager::MallocStaticMem(size_t size, bool) {
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  if (context_ptr->enable_dynamic_mem_pool()) {
+    auto device_ptr = AllocTensorMemDynamic(size);
+    MS_EXCEPTION_IF_NULL(device_ptr);
+    return AddressOffset(device_ptr, 0);
+  }
+
+  auto align_size = GetCommonAlignSize(size);
+  if (static_mem_offset_ < align_size) {
+    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
+                      << "] static[" << total_static_size_ << "])"
+                      << " malloc [" << align_size << "] failed!";
+  }
+  auto offset = static_mem_offset_ - align_size;
+  if (dynamic_mem_offset_ > offset) {
+    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
+                      << "] static[" << total_static_size_ << "])"
+                      << " malloc [" << align_size << "] failed!";
+  }
+  total_static_size_ += align_size;
+  static_mem_offset_ = offset;
+  return device_mem_base_ + offset;
+}
+}  // namespace gpu
+}  // namespace device
+}  // namespace mindspore

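GPUMemoryManager::MallocStaticMem above inherits the two-ended layout the runtime used before: static allocations are carved downward from the top of the pre-allocated block while dynamic allocations grow upward from offset 0, and the two "Out of memory" checks detect the offsets crossing. A minimal stand-alone model of that scheme, with alignment and logging omitted:

// Minimal model of the two-ended offset scheme behind MallocStaticMem; member
// names mirror the ones above, everything else is simplified for illustration.
#include <cstddef>
#include <stdexcept>

struct OffsetArena {
  std::size_t size;            // device_mem_size_
  std::size_t static_offset;   // static_mem_offset_, grows downward from size
  std::size_t dynamic_offset;  // dynamic_mem_offset_, grows upward from 0

  explicit OffsetArena(std::size_t total) : size(total), static_offset(total), dynamic_offset(0) {}

  // Static allocations are carved from the top of the block.
  std::size_t MallocStatic(std::size_t n) {
    if (static_offset < n || static_offset - n < dynamic_offset) {
      throw std::runtime_error("out of memory: static region hit dynamic region");
    }
    static_offset -= n;
    return static_offset;  // offset of the new static allocation
  }

  // Dynamic allocations are carved from the bottom.
  std::size_t MallocDynamic(std::size_t n) {
    if (dynamic_offset + n > static_offset) {
      throw std::runtime_error("out of memory: dynamic region hit static region");
    }
    std::size_t offset = dynamic_offset;
    dynamic_offset += n;
    return offset;
  }
};
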
@@ -0,0 +1,40 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_
+#include "device/memory_manager.h"
+namespace mindspore {
+namespace device {
+namespace gpu {
+class GPUMemoryManager : public MemoryManager {
+ public:
+  GPUMemoryManager() = default;
+  virtual ~GPUMemoryManager() = default;
+
+  void MallocDeviceMemory() override;
+  void FreeDeviceMemory() override;
+
+  void *AllocTensorMemDynamic(size_t size) override;
+  void FreeTensorMemDynamic(void *device_ptr) override;
+
+ protected:
+  uint8_t *MallocStaticMem(size_t size, bool communication_mem);
+};
+}  // namespace gpu
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEVICE_GPU_GPU_MEMORY_MANAGER_H_

@ -31,18 +31,13 @@
|
||||||
#include "ir/value.h"
|
#include "ir/value.h"
|
||||||
using mindspore::kernel::Address;
|
using mindspore::kernel::Address;
|
||||||
using mindspore::kernel::AddressPtr;
|
using mindspore::kernel::AddressPtr;
|
||||||
using mindspore::memreuse::BestFitMemReuse;
|
|
||||||
using mindspore::memreuse::MemReuseUtilPtr;
|
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
namespace device {
|
namespace device {
|
||||||
KernelRuntime::~KernelRuntime() {
|
KernelRuntime::~KernelRuntime() {
|
||||||
device_mem_base_ = nullptr;
|
|
||||||
device_mem_pool_base_ = nullptr;
|
|
||||||
#ifdef ENABLE_DUMP_E2E
|
#ifdef ENABLE_DUMP_E2E
|
||||||
dump_conf_ptr_ = nullptr;
|
dump_conf_ptr_ = nullptr;
|
||||||
#endif
|
#endif
|
||||||
mem_reuse_util_ptr_ = nullptr;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
bool KernelRuntime::Run(session::KernelGraph *graph) {
|
bool KernelRuntime::Run(session::KernelGraph *graph) {
|
||||||
|
@ -88,11 +83,6 @@ bool KernelRuntime::LoadTask(const session::KernelGraph *graph) {
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
void KernelRuntime::FreeHostMemory() {
|
|
||||||
dynamic_mem_offset_ = 0;
|
|
||||||
static_mem_offset_ = 0;
|
|
||||||
}
|
|
||||||
|
|
||||||
// for D to impl
|
// for D to impl
|
||||||
bool KernelRuntime::RunTask(const session::KernelGraph *graph) {
|
bool KernelRuntime::RunTask(const session::KernelGraph *graph) {
|
||||||
if (graph != nullptr) {
|
if (graph != nullptr) {
|
||||||
|
@ -126,13 +116,11 @@ size_t KernelRuntime::CountNodeDeviceMemorySize(const mindspore::AnfNodePtr &nod
|
||||||
void KernelRuntime::AssignMemory(session::KernelGraph *graph) {
|
void KernelRuntime::AssignMemory(session::KernelGraph *graph) {
|
||||||
auto context_ptr = MsContext::GetInstance();
|
auto context_ptr = MsContext::GetInstance();
|
||||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
|
mem_manager_->ResetDynamicMemory();
|
||||||
AssignStaticMemory(graph);
|
AssignStaticMemory(graph);
|
||||||
bool is_enable_mem_reuse = context_ptr->enable_mem_reuse();
|
|
||||||
if (is_enable_mem_reuse) {
|
|
||||||
ReuseAssignDynamicMemory(graph);
|
|
||||||
} else {
|
|
||||||
AssignDynamicMemory(graph);
|
AssignDynamicMemory(graph);
|
||||||
}
|
|
||||||
UpdateRefNodeOutputMem(graph);
|
UpdateRefNodeOutputMem(graph);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -159,6 +147,7 @@ void KernelRuntime::AssignStaticMemory(session::KernelGraph *graph) {
|
||||||
void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> &input_tensors,
|
void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr> &input_tensors,
|
||||||
const session::KernelGraph *graph) {
|
const session::KernelGraph *graph) {
|
||||||
MS_EXCEPTION_IF_NULL(graph);
|
MS_EXCEPTION_IF_NULL(graph);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
for (size_t input_index = 0; input_index < graph->inputs().size(); ++input_index) {
|
for (size_t input_index = 0; input_index < graph->inputs().size(); ++input_index) {
|
||||||
auto item = graph->inputs()[input_index];
|
auto item = graph->inputs()[input_index];
|
||||||
MS_EXCEPTION_IF_NULL(item);
|
MS_EXCEPTION_IF_NULL(item);
|
||||||
|
@ -180,7 +169,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
|
||||||
auto device_address =
|
auto device_address =
|
||||||
CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
|
CreateDeviceAddress(nullptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
|
||||||
MS_EXCEPTION_IF_NULL(device_address);
|
MS_EXCEPTION_IF_NULL(device_address);
|
||||||
MallocOpMemory(device_address, tensor_size, kStaticMem);
|
mem_manager_->MallocOpMemory(device_address, tensor_size);
|
||||||
AnfAlgo::SetOutputAddr(device_address, index, item.get());
|
AnfAlgo::SetOutputAddr(device_address, index, item.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -188,6 +177,7 @@ void KernelRuntime::RunOpAssignInputMemory(const std::vector<tensor::TensorPtr>
|
||||||
|
|
||||||
void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) {
|
void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) {
|
||||||
MS_EXCEPTION_IF_NULL(kernel);
|
MS_EXCEPTION_IF_NULL(kernel);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
|
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
|
||||||
MS_EXCEPTION_IF_NULL(kernel_mod);
|
MS_EXCEPTION_IF_NULL(kernel_mod);
|
||||||
auto output_sizes = kernel_mod->GetOutputSizeList();
|
auto output_sizes = kernel_mod->GetOutputSizeList();
|
||||||
|
@ -208,13 +198,14 @@ void KernelRuntime::RunOpAssignOutputMemory(const AnfNodePtr &kernel) {
|
||||||
auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
|
auto output_type = AnfAlgo::GetOutputDeviceDataType(kernel, i);
|
||||||
auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
|
auto device_address = CreateDeviceAddress(nullptr, output_sizes[i], output_format, output_type);
|
||||||
MS_EXCEPTION_IF_NULL(device_address);
|
MS_EXCEPTION_IF_NULL(device_address);
|
||||||
MallocOpMemory(device_address, output_sizes[i], kDynamicMem);
|
mem_manager_->MallocOpMemory(device_address, output_sizes[i]);
|
||||||
AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
|
AnfAlgo::SetOutputAddr(device_address, i, kernel.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
|
void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
|
||||||
MS_EXCEPTION_IF_NULL(kernel);
|
MS_EXCEPTION_IF_NULL(kernel);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
if (kernel->isa<CNode>()) {
|
if (kernel->isa<CNode>()) {
|
||||||
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
|
auto kernel_mod = AnfAlgo::GetKernelMod(kernel);
|
||||||
MS_EXCEPTION_IF_NULL(kernel_mod);
|
MS_EXCEPTION_IF_NULL(kernel_mod);
|
||||||
|
@ -222,7 +213,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
|
||||||
for (size_t i = 0; i < workspace_lists.size(); ++i) {
|
for (size_t i = 0; i < workspace_lists.size(); ++i) {
|
||||||
auto device_address = CreateDeviceAddress(nullptr, workspace_lists[i], "", kTypeUnknown);
|
auto device_address = CreateDeviceAddress(nullptr, workspace_lists[i], "", kTypeUnknown);
|
||||||
MS_EXCEPTION_IF_NULL(device_address);
|
MS_EXCEPTION_IF_NULL(device_address);
|
||||||
MallocOpMemory(device_address, workspace_lists[i], kDynamicMem);
|
mem_manager_->MallocOpMemory(device_address, workspace_lists[i]);
|
||||||
AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
|
AnfAlgo::SetWorkspaceAddr(device_address, i, kernel.get());
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
@ -230,6 +221,7 @@ void KernelRuntime::RunOpAssignWorkSpaceMemory(const AnfNodePtr &kernel) {
|
||||||
|
|
||||||
void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
|
void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
|
||||||
MS_EXCEPTION_IF_NULL(graph);
|
MS_EXCEPTION_IF_NULL(graph);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
for (auto &item : graph->inputs()) {
|
for (auto &item : graph->inputs()) {
|
||||||
MS_EXCEPTION_IF_NULL(item);
|
MS_EXCEPTION_IF_NULL(item);
|
||||||
if (!item->isa<Parameter>()) {
|
if (!item->isa<Parameter>()) {
|
||||||
|
@ -247,7 +239,7 @@ void KernelRuntime::AssignStaticMemoryInput(const session::KernelGraph *graph) {
|
||||||
output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
|
output_type_id = AnfAlgo::GetOutputInferDataType(item, index);
|
||||||
}
|
}
|
||||||
auto tensor_size = CountNodeDeviceMemorySize(item, index);
|
auto tensor_size = CountNodeDeviceMemorySize(item, index);
|
||||||
auto ptr = MallocStaticMem(tensor_size, false);
|
auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size);
|
||||||
auto address = CreateDeviceAddress(ptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
|
auto address = CreateDeviceAddress(ptr, tensor_size, AnfAlgo::GetOutputFormat(item, index), output_type_id);
|
||||||
AnfAlgo::SetOutputAddr(address, index, item.get());
|
AnfAlgo::SetOutputAddr(address, index, item.get());
|
||||||
}
|
}
|
||||||
|
@ -301,6 +293,7 @@ void KernelRuntime::UpdateRefNodeOutputMem(const session::KernelGraph *graph) {
|
||||||
|
|
||||||
void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node) {
|
void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node) {
|
||||||
MS_EXCEPTION_IF_NULL(node);
|
MS_EXCEPTION_IF_NULL(node);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
auto kernel_mod = AnfAlgo::GetKernelMod(node);
|
auto kernel_mod = AnfAlgo::GetKernelMod(node);
|
||||||
MS_EXCEPTION_IF_NULL(kernel_mod);
|
MS_EXCEPTION_IF_NULL(kernel_mod);
|
||||||
auto output_sizes = kernel_mod->GetOutputSizeList();
|
auto output_sizes = kernel_mod->GetOutputSizeList();
|
||||||
|
@ -314,12 +307,12 @@ void KernelRuntime::AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr
|
||||||
std::vector<size_t> align_size_list;
|
std::vector<size_t> align_size_list;
|
||||||
for (uint64_t mem_size : output_sizes) {
|
for (uint64_t mem_size : output_sizes) {
|
||||||
if (context_ptr->enable_hccl()) {
|
if (context_ptr->enable_hccl()) {
|
||||||
mem_size = GetCommonAlignSize(mem_size);
|
mem_size = mem_manager_->GetCommonAlignSize(mem_size);
|
||||||
}
|
}
|
||||||
total_size += mem_size;
|
total_size += mem_size;
|
||||||
align_size_list.emplace_back(mem_size);
|
align_size_list.emplace_back(mem_size);
|
||||||
}
|
}
|
||||||
uint8_t *output_ptr = CalDeviceMem(node, total_size, flag, 0);
|
uint8_t *output_ptr = mem_manager_->MallocOutputMem(node, 0, flag, total_size);
|
||||||
for (size_t j = 0; j < align_size_list.size(); ++j) {
|
for (size_t j = 0; j < align_size_list.size(); ++j) {
|
||||||
std::string output_format = AnfAlgo::GetOutputFormat(node, j);
|
std::string output_format = AnfAlgo::GetOutputFormat(node, j);
|
||||||
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, j);
|
auto output_type = AnfAlgo::GetOutputDeviceDataType(node, j);
|
||||||
|
@ -333,6 +326,7 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
|
||||||
auto context_ptr = MsContext::GetInstance();
|
auto context_ptr = MsContext::GetInstance();
|
||||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||||
MS_EXCEPTION_IF_NULL(node);
|
MS_EXCEPTION_IF_NULL(node);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
size_t total_size = 0;
|
size_t total_size = 0;
|
||||||
std::vector<std::pair<mindspore::device::DeviceAddress *, size_t>> addr_size;
|
std::vector<std::pair<mindspore::device::DeviceAddress *, size_t>> addr_size;
|
||||||
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(node); ++i) {
|
for (size_t i = 0; i < AnfAlgo::GetInputTensorNum(node); ++i) {
|
||||||
|
@ -340,12 +334,12 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
|
||||||
MS_EXCEPTION_IF_NULL(address);
|
MS_EXCEPTION_IF_NULL(address);
|
||||||
auto mem_size = address->size();
|
auto mem_size = address->size();
|
||||||
if (context_ptr->enable_hccl()) {
|
if (context_ptr->enable_hccl()) {
|
||||||
mem_size = GetCommonAlignSize(mem_size);
|
mem_size = mem_manager_->GetCommonAlignSize(mem_size);
|
||||||
}
|
}
|
||||||
total_size += mem_size;
|
total_size += mem_size;
|
||||||
addr_size.emplace_back(address.get(), mem_size);
|
addr_size.emplace_back(address.get(), mem_size);
|
||||||
}
|
}
|
||||||
uint8_t *input_ptr = CalDeviceMem(node, total_size, kDynamicMem, 0);
|
uint8_t *input_ptr = mem_manager_->MallocOutputMem(node, 0, kDynamicMem, total_size);
|
||||||
for (const auto &iter : addr_size) {
|
for (const auto &iter : addr_size) {
|
||||||
MS_EXCEPTION_IF_NULL(iter.first);
|
MS_EXCEPTION_IF_NULL(iter.first);
|
||||||
iter.first->set_ptr(input_ptr);
|
iter.first->set_ptr(input_ptr);
|
||||||
|
@ -355,7 +349,8 @@ void KernelRuntime::UpdateCommunicationOpInputMem(const AnfNodePtr &node) {
|
||||||
|
|
||||||
void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index) {
|
void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index) {
|
||||||
MS_EXCEPTION_IF_NULL(node);
|
MS_EXCEPTION_IF_NULL(node);
|
||||||
if (IsCommunicationOp(node)) {
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
|
if (AnfAlgo::IsCommunicationOp(node)) {
|
||||||
UpdateCommunicationOpInputMem(node);
|
UpdateCommunicationOpInputMem(node);
|
||||||
AssignCommunicationNodeOutputMem(flag, node);
|
AssignCommunicationNodeOutputMem(flag, node);
|
||||||
return;
|
return;
|
||||||
|
@ -375,7 +370,7 @@ void KernelRuntime::AssignNodeOutputMem(int flag, const AnfNodePtr &node, int in
|
||||||
MS_LOG(INFO) << "Already malloc index:" << i;
|
MS_LOG(INFO) << "Already malloc index:" << i;
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
auto ptr = CalDeviceMem(node, output_sizes[i], flag, i);
|
auto ptr = mem_manager_->MallocOutputMem(node, i, flag, output_sizes[i]);
|
||||||
if (ptr == nullptr) {
|
if (ptr == nullptr) {
|
||||||
// reused ptr, no need alloc, continue;
|
// reused ptr, no need alloc, continue;
|
||||||
continue;
|
continue;
|
||||||
|
@ -390,6 +385,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
|
||||||
size_t output_idx) {
|
size_t output_idx) {
|
||||||
MS_EXCEPTION_IF_NULL(value_node);
|
MS_EXCEPTION_IF_NULL(value_node);
|
||||||
MS_EXCEPTION_IF_NULL(node_value);
|
MS_EXCEPTION_IF_NULL(node_value);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
auto tensor = node_value->cast<TensorPtr>();
|
auto tensor = node_value->cast<TensorPtr>();
|
||||||
if (tensor == nullptr) {
|
if (tensor == nullptr) {
|
||||||
MS_LOG(WARNING) << "Tensor is null";
|
MS_LOG(WARNING) << "Tensor is null";
|
||||||
|
@ -397,7 +393,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
|
||||||
}
|
}
|
||||||
size_t tensor_size = tensor->data().nbytes();
|
size_t tensor_size = tensor->data().nbytes();
|
||||||
auto node_size = CountNodeDeviceMemorySize(value_node, output_idx);
|
auto node_size = CountNodeDeviceMemorySize(value_node, output_idx);
|
||||||
auto ptr = MallocStaticMem(node_size, false);
|
auto ptr = mem_manager_->MallocMem(kStaticMem, node_size);
|
||||||
TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(value_node, output_idx);
|
TypeId output_type_id = AnfAlgo::GetOutputDeviceDataType(value_node, output_idx);
|
||||||
if (output_type_id == kTypeUnknown) {
|
if (output_type_id == kTypeUnknown) {
|
||||||
output_type_id = AnfAlgo::GetOutputInferDataType(value_node, output_idx);
|
output_type_id = AnfAlgo::GetOutputInferDataType(value_node, output_idx);
|
||||||
|
@ -414,6 +410,7 @@ void KernelRuntime::AssignValueNodeTensor(const ValueNodePtr &value_node, const
|
||||||
|
|
||||||
void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
|
void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
|
||||||
MS_EXCEPTION_IF_NULL(graph);
|
MS_EXCEPTION_IF_NULL(graph);
|
||||||
|
MS_EXCEPTION_IF_NULL(mem_manager_);
|
||||||
for (auto &value_node : graph->graph_value_nodes()) {
|
for (auto &value_node : graph->graph_value_nodes()) {
|
||||||
MS_EXCEPTION_IF_NULL(value_node);
|
MS_EXCEPTION_IF_NULL(value_node);
|
||||||
if (AnfAlgo::OutputAddrExist(value_node, 0)) {
|
if (AnfAlgo::OutputAddrExist(value_node, 0)) {
|
||||||
|
@@ -440,7 +437,7 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
     } else if (node_value->isa<StringImm>()) {
       auto value = GetValue<std::string>(node_value);
       size_t tensor_size = value.size();
-      auto ptr = MallocStaticMem(tensor_size, false);
+      auto ptr = mem_manager_->MallocMem(kStaticMem, tensor_size);
       auto address = CreateDeviceAddress(ptr, tensor_size, kOpFormat_DEFAULT, kNumberTypeUInt8);
       MS_EXCEPTION_IF_NULL(address);
       AnfAlgo::SetOutputAddr(address, 0, value_node.get());
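Note: string constants get the same static-memory treatment as tensors, just with the payload described as `value.size()` raw UInt8 bytes. A host-only sketch of that staging step (hypothetical names; the actual copy to the device goes through the created DeviceAddress):

    #include <cstdint>
    #include <cstring>
    #include <string>
    #include <vector>

    std::vector<uint8_t> static_pool(4096);  // toy stand-in for the device-side static arena
    size_t pool_offset = 0;

    // Reserve a slot for the constant and copy its bytes in.
    uint8_t *StageStringConstant(const std::string &value) {
      uint8_t *slot = static_pool.data() + pool_offset;
      std::memcpy(slot, value.data(), value.size());
      pool_offset += value.size();  // the real arena would round this up to kMemAlignSize
      return slot;
    }

    int main() { return StageStringConstant("hello") != nullptr ? 0 : 1; }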
@@ -452,102 +449,36 @@ void KernelRuntime::AssignStaticMemoryValueNode(session::KernelGraph *graph) {
   }
 }

-void KernelRuntime::AssignDynamicMemory(const session::KernelGraph *graph) {
+void KernelRuntime::AssignDynamicMemory(session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
-  // reset dynamic mem offset
-  dynamic_mem_offset_ = 0;
+  MS_EXCEPTION_IF_NULL(mem_manager_);
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  bool is_enable_mem_reuse = context_ptr->enable_mem_reuse();
+  auto mem_flag = kDynamicMem;
+  if (is_enable_mem_reuse) {
+    mem_manager_->InitReuseDynamicMemory(graph);
+    mem_flag = kReuseDynamicMem;
+  }
   auto &kernels = graph->execution_order();
   for (auto &kernel : kernels) {
-    AssignNodeOutputMem(kDynamicMem, kernel, kGetAllOuts);
-    AssignWorkSpaceMem(kernel);
+    AssignNodeOutputMem(mem_flag, kernel, kGetAllOuts);
+    AssignWorkSpaceMem(mem_flag, kernel);
   }
 }

-void KernelRuntime::ReuseAssignDynamicMemory(session::KernelGraph *graph) {
-  MS_EXCEPTION_IF_NULL(graph);
-  dynamic_mem_offset_ = 0;
-  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
-  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
-  // set all infos
-  mem_reuse_util_ptr->SetAllInfo(graph);
-  auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>();
-  MS_EXCEPTION_IF_NULL(bestfit_mem_reuse);
-  bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
-  size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize();
-  MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]";
-  mem_reuse_util_ptr_ = mem_reuse_util_ptr;
-  auto base_ptr = MallocDynamicMem(total_allocated_size, false);
-  mem_reuse_util_ptr_->set_mem_base(base_ptr);
-  auto &kernels = graph->execution_order();
-  for (auto &kernel : kernels) {
-    AssignNodeOutputMem(kReuseDynamicMem, kernel, kGetAllOuts);
-    AssignReuseWorkSpaceMem(kernel);
-  }
-}
-
-void KernelRuntime::AssignReuseWorkSpaceMem(const AnfNodePtr &node) {
+void KernelRuntime::AssignWorkSpaceMem(int flag, const AnfNodePtr &node) {
   MS_EXCEPTION_IF_NULL(node);
+  MS_EXCEPTION_IF_NULL(mem_manager_);
   auto kernel_mod = AnfAlgo::GetKernelMod(node);
   MS_EXCEPTION_IF_NULL(kernel_mod);
   size_t index = 0;
   for (auto &size : kernel_mod->GetWorkspaceSizeList()) {
-    auto wk_ptr = mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
-    AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(wk_ptr, size, "", kTypeUnknown), index, node.get());
-    index++;
-  }
-}
-
-void KernelRuntime::AssignWorkSpaceMem(const AnfNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(node);
-  if (node->isa<CNode>()) {
-    auto kernel_mod = AnfAlgo::GetKernelMod(node);
-    MS_EXCEPTION_IF_NULL(kernel_mod);
-    size_t index = 0;
-    for (auto &size : kernel_mod->GetWorkspaceSizeList()) {
-      auto ptr = MallocDynamicMem(size, false);
+    auto ptr = mem_manager_->MallocWorkSpaceMem(node, index, flag, size);
     AnfAlgo::SetWorkspaceAddr(CreateDeviceAddress(ptr, size, "", kTypeUnknown), index, node.get());
     index++;
   }
 }
-}

-bool KernelRuntime::IsCommunicationOp(const AnfNodePtr &node) {
-  MS_EXCEPTION_IF_NULL(node);
-  auto kernel_name = AnfAlgo::GetCNodeName(node);
-  auto kernel_type = AnfAlgo::GetKernelType(node);
-  if (kernel_name == kAllReduceOpName || kernel_type == HCCL_KERNEL) {
-    return true;
-  }
-  return false;
-}
-
-uint8_t *KernelRuntime::CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index) {
-  MS_EXCEPTION_IF_NULL(node);
-  auto context_ptr = MsContext::GetInstance();
-  MS_EXCEPTION_IF_NULL(context_ptr);
-  uint8_t *ptr = nullptr;
-  if (IsCommunicationOp(node)) {
-    bool communication_mem = false;
-    if (context_ptr->enable_hccl()) {
-      communication_mem = true;
-    }
-    if (flag == kStaticMem) {
-      ptr = MallocStaticMem(size, communication_mem);
-    } else {
-      ptr = MallocDynamicMem(size, communication_mem);
-    }
-    return ptr;
-  }
-
-  if (flag == kStaticMem) {
-    ptr = MallocStaticMem(size, false);
-  } else if (flag == kDynamicMem) {
-    ptr = MallocDynamicMem(size, false);
-  } else if (flag == kReuseDynamicMem) {
-    ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
-  }
-  return ptr;
-}
-
 void KernelRuntime::GenLaunchArgs(const mindspore::kernel::KernelMod &kernel_mod, const mindspore::AnfNodePtr &kernel,
                                   AddressPtrList *kernel_inputs, AddressPtrList *const kernel_workspaces,

(The rendered call read `MallocWorkSpaceMem(node, flag, index, size)`, which swaps the `index` and `flag` arguments relative to the declaration in device/memory_manager.h; it is written here in the declared order.)
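Note: this hunk folds the old `ReuseAssignDynamicMemory` path into `AssignDynamicMemory`: when mem reuse is on, the manager runs the best-fit planner once over the whole graph, and every later output/workspace request is resolved against that plan through the `kReuseDynamicMem` flag; otherwise each request just bumps the dynamic arena. The control flow, as a self-contained sketch (names hypothetical):

    #include <cstdio>

    enum MemFlag { kDynamic = 1, kReuseDynamic = 2 };

    struct Graph {};                  // stand-in for session::KernelGraph
    bool reuse_enabled = true;        // stand-in for context_ptr->enable_mem_reuse()
    void PlanReuse(const Graph &) {}  // stand-in for mem_manager_->InitReuseDynamicMemory()

    void AssignDynamic(const Graph &g) {
      int flag = kDynamic;
      if (reuse_enabled) {
        PlanReuse(g);          // one planning pass up front...
        flag = kReuseDynamic;  // ...then every request below consults the plan
      }
      // for each kernel in execution order: assign outputs and workspaces with `flag`
      std::printf("assigning with flag %d\n", flag);
    }

    int main() { AssignDynamic(Graph{}); }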
@@ -659,65 +590,6 @@ bool KernelRuntime::LaunchKernelMod(const session::KernelGraph &graph) {
   return true;
 }

-size_t KernelRuntime::GetCommonAlignSize(size_t input_size) const {
-  return (input_size + mem_align_size_ + 31) / mem_align_size_ * mem_align_size_;
-}
-
-size_t KernelRuntime::GetCommunicationAlignSize(size_t input_size) const {
-  return (input_size + mem_align_size_ - 1) / mem_align_size_ * mem_align_size_ + 2 * mem_align_size_;
-}
-
-uint8_t *KernelRuntime::MallocStaticMem(size_t size, bool communication_mem) {
-  size_t align_size = 0;
-  if (communication_mem) {
-    align_size = GetCommunicationAlignSize(size);
-  } else {
-    align_size = GetCommonAlignSize(size);
-  }
-  if (static_mem_offset_ < align_size) {
-    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
-                      << "] static[" << total_static_size_ << "])"
-                      << " malloc [" << align_size << "] failed!";
-  }
-  total_static_size_ += align_size;
-  auto offset = static_mem_offset_ - align_size;
-  if (dynamic_mem_offset_ > offset) {
-    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
-                      << "] static[" << total_static_size_ << "])"
-                      << " malloc [" << align_size << "] failed!";
-  }
-  static_mem_offset_ = offset;
-  if (communication_mem) {
-    return device_mem_base_ + offset + mem_align_size_;
-  } else {
-    return device_mem_base_ + offset;
-  }
-}
-
-uint8_t *KernelRuntime::MallocDynamicMem(size_t size, bool communication_mem) {
-  size_t align_size = 0;
-  if (communication_mem) {
-    align_size = GetCommunicationAlignSize(size);
-  } else {
-    align_size = GetCommonAlignSize(size);
-  }
-  uint64_t offset = dynamic_mem_offset_;
-  auto new_offset = dynamic_mem_offset_ + align_size;
-  if (new_offset > static_mem_offset_) {
-    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
-                      << "] static[" << total_static_size_ << "])"
-                      << " malloc [" << align_size << "] failed!";
-  }
-  total_dynamic_size_ += align_size;
-  dynamic_mem_offset_ = new_offset;
-
-  if (communication_mem) {
-    return device_mem_base_ + offset + mem_align_size_;
-  } else {
-    return device_mem_base_ + offset;
-  }
-}
-
 bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) {
   MS_EXCEPTION_IF_NULL(graph);
   if (!LaunchKernelMod(*graph)) {
@@ -731,29 +603,6 @@ bool KernelRuntime::LaunchKernel(const session::KernelGraph *graph) {
   return true;
 }

-void KernelRuntime::MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag) {
-  if (flag == kStaticMem) {
-    address->ptr_ = MallocStaticMem(size, false);
-  } else if (flag == kDynamicMem) {
-    address->ptr_ = MallocDynamicMem(size, false);
-  } else {
-    MS_LOG(EXCEPTION) << "Unknown memory type!";
-  }
-}
-
-void *KernelRuntime::AllocTensorMemDynamic(size_t size) {
-  if (size == 0) {
-    MS_LOG(ERROR) << "AllocTensorMemDynamic size is 0.";
-  }
-  return nullptr;
-}
-
-void KernelRuntime::FreeTensorMemDynamic(void *device_ptr) {
-  if (device_ptr == nullptr) {
-    MS_LOG(ERROR) << "FreeTensorMemDynamic device_ptr is null.";
-  }
-}
-
 #ifdef ENABLE_DUMP_E2E
 bool KernelRuntime::SetDumpConf() {
   dump_conf_ptr_ = std::make_shared<Dump>();
--- a/mindspore/ccsrc/device/kernel_runtime.h
+++ b/mindspore/ccsrc/device/kernel_runtime.h
@@ -20,8 +20,7 @@
 #include <memory>
 #include <string>
 #include <map>
-#include "pre_activate/mem_reuse/mem_reuse.h"
-#include "pre_activate/mem_reuse/mem_reuse_allocator.h"
 #include "device/device_address.h"
 #include "ir/meta_tensor.h"
 #include "predict/generator/utils/ir_model_util.h"
@@ -32,21 +31,16 @@
 #include "session/anf_runtime_algorithm.h"
 #include "kernel/kernel.h"
 #include "utils/context/ms_context.h"
+#include "device/memory_manager.h"

 // using mindspore::session::KernelGraph;
 using mindspore::tensor::Tensor;
 using TensorPtr = std::shared_ptr<Tensor>;
-using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr;
 using mindspore::kernel::AddressPtr;
 using AddressPtrList = std::vector<mindspore::kernel::AddressPtr>;

 namespace mindspore {
 namespace device {
-const int kStaticMem = 0;
-const int kDynamicMem = 1;
-const int kReuseDynamicMem = 2;
-const int kGetAllOuts = -1;

 class KernelRuntime {
  public:
   KernelRuntime() = default;
@@ -65,7 +59,6 @@ class KernelRuntime {
   DumpConfPtr GetDumpConf();
 #endif
   virtual bool LoadTask(const session::KernelGraph *graph);
-  virtual void FreeHostMemory();
   // for GPU and D to impl
   virtual void ReleaseDeviceRes() {}
   void set_device_id(uint32_t device_id) { device_id_ = device_id; }
@@ -75,29 +68,17 @@ class KernelRuntime {
                                        TypeId type_id) = 0;
   virtual bool SyncStream() = 0;
   void AssignStaticMemory(session::KernelGraph *graph);
-  void AssignDynamicMemory(const session::KernelGraph *graph);
+  void AssignDynamicMemory(session::KernelGraph *graph);
   void ReuseAssignDynamicMemory(session::KernelGraph *graph);
   void AssignNodeOutputMem(int flag, const AnfNodePtr &node, int index);
-  void AssignWorkSpaceMem(const AnfNodePtr &node);
+  void AssignWorkSpaceMem(int flag, const AnfNodePtr &node);
   void AssignReuseWorkSpaceMem(const AnfNodePtr &node);
   void AssignCommunicationNodeOutputMem(int flag, const AnfNodePtr &node);
   void UpdateRefNodeOutputMem(const session::KernelGraph *graph);
   void UpdateCommunicationOpInputMem(const AnfNodePtr &node);
-  bool IsCommunicationOp(const AnfNodePtr &node);
-  size_t GetCommonAlignSize(size_t input_size) const;
-  size_t GetCommunicationAlignSize(size_t input_size) const;
-
-  uint8_t *CalDeviceMem(const AnfNodePtr &node, size_t size, int flag, size_t index);
-  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem);
-  uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
 #ifdef ENABLE_DUMP_E2E
   bool SetDumpConf();
 #endif
-  // Alloc memory use the dynamic memory pool.
-  virtual void *AllocTensorMemDynamic(size_t size);
-  // Free memory use the dynamic memory pool.
-  virtual void FreeTensorMemDynamic(void *device_ptr);
-  virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size, int flag);

  private:
   void AssignStaticMemoryOutput(const session::KernelGraph *graph);
@@ -114,20 +95,11 @@ class KernelRuntime {

  protected:
   uint32_t device_id_{0};
-  uint8_t *device_mem_base_{nullptr};
-  uint8_t *device_mem_pool_base_{nullptr};
-  uint64_t device_mem_size_{0};
-  uint64_t device_mem_pool_size_{0};
-  uint64_t dynamic_mem_offset_{0};
-  uint64_t static_mem_offset_{0};
-  const uint64_t mem_align_size_ = 512;
 #ifdef ENABLE_DUMP_E2E
   DumpConfPtr dump_conf_ptr_;
 #endif
   void *stream_ = nullptr;
-  size_t total_static_size_ = 0;
-  size_t total_dynamic_size_ = 0;
-  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
+  std::shared_ptr<MemoryManager> mem_manager_{nullptr};
 };
 using KernelRuntimePtr = std::shared_ptr<KernelRuntime>;
 }  // namespace device
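Note: after this header change KernelRuntime no longer owns any raw base pointers or offsets; all of that state moved behind a single `std::shared_ptr<MemoryManager>`, and each backend installs its own subclass during `Init()` (the Ascend runtime creates an `AscendMemoryManager`). A minimal sketch of the ownership pattern (class names are stand-ins, not the MindSpore API):

    #include <memory>

    class MemoryManagerBase {  // stand-in for device::MemoryManager
     public:
      virtual ~MemoryManagerBase() = default;
      virtual void MallocDeviceMemory() = 0;
      virtual void FreeDeviceMemory() = 0;
    };

    class FakeDeviceMemoryManager : public MemoryManagerBase {  // e.g. an Ascend- or GPU-specific manager
     public:
      void MallocDeviceMemory() override { /* grab the device region here */ }
      void FreeDeviceMemory() override { /* release it here */ }
    };

    class Runtime {  // stand-in for KernelRuntime
     public:
      bool Init() {
        mem_manager_ = std::make_shared<FakeDeviceMemoryManager>();
        mem_manager_->MallocDeviceMemory();
        return true;
      }
      void ReleaseDeviceRes() {
        if (mem_manager_ != nullptr) mem_manager_->FreeDeviceMemory();
      }

     private:
      std::shared_ptr<MemoryManagerBase> mem_manager_;
    };

    int main() {
      Runtime rt;
      rt.Init();
      rt.ReleaseDeviceRes();
    }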
--- /dev/null
+++ b/mindspore/ccsrc/device/memory_manager.cc
@@ -0,0 +1,170 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#include "device/memory_manager.h"
+#include "session/anf_runtime_algorithm.h"
+#include "utils/context/ms_context.h"
+using mindspore::memreuse::BestFitMemReuse;
+using mindspore::memreuse::MemReuseUtilPtr;
+namespace mindspore {
+namespace device {
+MemoryManager::~MemoryManager() {
+  device_mem_base_ = nullptr;
+  device_mem_pool_base_ = nullptr;
+  mem_reuse_util_ptr_ = nullptr;
+}
+
+size_t MemoryManager::GetCommonAlignSize(size_t input_size) const {
+  return (input_size + kMemAlignSize + 31) / kMemAlignSize * kMemAlignSize;
+}
+
+size_t MemoryManager::GetCommunicationAlignSize(size_t input_size) const {
+  return (input_size + kMemAlignSize - 1) / kMemAlignSize * kMemAlignSize + 2 * kMemAlignSize;
+}
+
+void MemoryManager::InitReuseDynamicMemory(session::KernelGraph *graph) {
+  MS_EXCEPTION_IF_NULL(graph);
+  MemReuseUtilPtr mem_reuse_util_ptr = std::make_shared<memreuse::MemReuseUtil>();
+  MS_EXCEPTION_IF_NULL(mem_reuse_util_ptr);
+  // set all infos
+  mem_reuse_util_ptr->SetAllInfo(graph);
+  auto bestfit_mem_reuse = std::make_shared<BestFitMemReuse>();
+  MS_EXCEPTION_IF_NULL(bestfit_mem_reuse);
+  bestfit_mem_reuse->Reuse(mem_reuse_util_ptr.get());
+  size_t total_allocated_size = bestfit_mem_reuse->GetAllocatedSize();
+  MS_LOG(INFO) << "TotalReuseDynamicSize [" << total_allocated_size << "]";
+  mem_reuse_util_ptr_ = mem_reuse_util_ptr;
+  auto base_ptr = MallocDynamicMem(total_allocated_size, false);
+  mem_reuse_util_ptr_->set_mem_base(base_ptr);
+}
+
+uint8_t *MemoryManager::MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size) {
+  MS_EXCEPTION_IF_NULL(node);
+  auto context_ptr = MsContext::GetInstance();
+  MS_EXCEPTION_IF_NULL(context_ptr);
+  uint8_t *ptr = nullptr;
+  if (AnfAlgo::IsCommunicationOp(node)) {
+    bool communication_mem = false;
+    if (context_ptr->enable_hccl()) {
+      communication_mem = true;
+    }
+    if (flag == kStaticMem) {
+      ptr = MallocStaticMem(size, communication_mem);
+    } else {
+      ptr = MallocDynamicMem(size, communication_mem);
+    }
+    return ptr;
+  }
+
+  if (flag == kStaticMem) {
+    ptr = MallocStaticMem(size, false);
+  } else if (flag == kDynamicMem) {
+    ptr = MallocDynamicMem(size, false);
+  } else if (flag == kReuseDynamicMem) {
+    ptr = mem_reuse_util_ptr_->GetNodeOutputPtr(node, index);
+  }
+  return ptr;
+}
+
+uint8_t *MemoryManager::MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size) {
+  if (flag == kReuseDynamicMem) {
+    return mem_reuse_util_ptr_->GetNodeWorkSpacePtr(node, index);
+  }
+  return MallocDynamicMem(size, false);
+}
+
+uint8_t *MemoryManager::MallocMem(int flag, size_t size) {
+  uint8_t *ptr = nullptr;
+  if (flag == kStaticMem) {
+    ptr = MallocStaticMem(size, false);
+  } else if (flag == kDynamicMem) {
+    ptr = MallocDynamicMem(size, false);
+  }
+  return ptr;
+}
+
+uint8_t *MemoryManager::MallocStaticMem(size_t size, bool communication_mem) {
+  size_t align_size = 0;
+  if (communication_mem) {
+    align_size = GetCommunicationAlignSize(size);
+  } else {
+    align_size = GetCommonAlignSize(size);
+  }
+  if (static_mem_offset_ < align_size) {
+    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
+                      << "] static[" << total_static_size_ << "])"
+                      << " malloc [" << align_size << "] failed!";
+  }
+  total_static_size_ += align_size;
+  auto offset = static_mem_offset_ - align_size;
+  if (dynamic_mem_offset_ > offset) {
+    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
+                      << "] static[" << total_static_size_ << "])"
+                      << " malloc [" << align_size << "] failed!";
+  }
+  static_mem_offset_ = offset;
+  if (communication_mem) {
+    return device_mem_base_ + offset + kMemAlignSize;
+  } else {
+    return device_mem_base_ + offset;
+  }
+}
+
+uint8_t *MemoryManager::MallocDynamicMem(size_t size, bool communication_mem) {
+  size_t align_size = 0;
+  if (communication_mem) {
+    align_size = GetCommunicationAlignSize(size);
+  } else {
+    align_size = GetCommonAlignSize(size);
+  }
+  uint64_t offset = dynamic_mem_offset_;
+  auto new_offset = dynamic_mem_offset_ + align_size;
+  if (new_offset > static_mem_offset_) {
+    MS_LOG(EXCEPTION) << "Out of memory!!! total[" << device_mem_size_ << "](dynamic[" << total_dynamic_size_
+                      << "] static[" << total_static_size_ << "])"
+                      << " malloc [" << align_size << "] failed!";
+  }
+  total_dynamic_size_ += align_size;
+  dynamic_mem_offset_ = new_offset;
+
+  if (communication_mem) {
+    return device_mem_base_ + offset + kMemAlignSize;
+  } else {
+    return device_mem_base_ + offset;
+  }
+}
+
+void MemoryManager::MallocOpMemory(const DeviceAddressPtr address, size_t size) {
+  auto device_ptr = AllocTensorMemDynamic(size);
+  MS_EXCEPTION_IF_NULL(device_ptr);
+  address->ptr_ = device_ptr;
+  address->mem_dynamic_alloc_ = true;
+}
+
+void *MemoryManager::AllocTensorMemDynamic(size_t size) {
+  if (size == 0) {
+    MS_LOG(ERROR) << "AllocTensorMemDynamic size is 0.";
+  }
+  return nullptr;
+}
+
+void MemoryManager::FreeTensorMemDynamic(void *device_ptr) {
+  if (device_ptr == nullptr) {
+    MS_LOG(ERROR) << "FreeTensorMemDynamic device_ptr is null.";
+  }
+}
+}  // namespace device
+}  // namespace mindspore
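Note: `MallocStaticMem` and `MallocDynamicMem` are verbatim moves from KernelRuntime (the `+ 31` slack in `GetCommonAlignSize` is carried over unchanged). The scheme carves one device region from both ends: dynamic allocations grow up from the base, static allocations grow down from the top, and the OOM checks fire when the two cursors would cross. Communication buffers are rounded up and padded with one extra `kMemAlignSize` block on each side, and the returned pointer skips the leading pad. A self-contained model of the scheme, with the two align formulas checked for kMemAlignSize = 512:

    #include <cassert>
    #include <cstdint>
    #include <vector>

    constexpr uint64_t kAlign = 512;  // mirrors kMemAlignSize

    size_t CommonAlign(size_t n) { return (n + kAlign + 31) / kAlign * kAlign; }
    size_t CommAlign(size_t n) { return (n + kAlign - 1) / kAlign * kAlign + 2 * kAlign; }

    struct Arena {
      std::vector<uint8_t> mem;
      uint64_t dyn_off = 0;  // dynamic cursor, grows up from the base
      uint64_t static_off;   // static cursor, grows down from the top
      explicit Arena(size_t size) : mem(size), static_off(size) {}

      uint8_t *MallocStatic(size_t size, bool comm) {
        size_t a = comm ? CommAlign(size) : CommonAlign(size);
        assert(static_off >= a && static_off - a >= dyn_off);  // the two OOM checks
        static_off -= a;
        return mem.data() + static_off + (comm ? kAlign : 0);  // skip the leading pad block
      }

      uint8_t *MallocDynamic(size_t size, bool comm) {
        size_t a = comm ? CommAlign(size) : CommonAlign(size);
        assert(dyn_off + a <= static_off);  // would cross the static cursor
        uint8_t *p = mem.data() + dyn_off + (comm ? kAlign : 0);
        dyn_off += a;
        return p;
      }
    };

    int main() {
      assert(CommonAlign(1) == 512);          // (1 + 512 + 31) / 512 * 512
      assert(CommAlign(1) == 512 + 2 * 512);  // one block plus a pad block on each side
      Arena arena(1 << 20);
      uint8_t *s = arena.MallocStatic(1000, false);
      uint8_t *d = arena.MallocDynamic(1000, true);
      assert(d < s);  // dynamic from the bottom, static from the top
      return 0;
    }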
--- /dev/null
+++ b/mindspore/ccsrc/device/memory_manager.h
@@ -0,0 +1,71 @@
+/**
+ * Copyright 2019 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_
+#include <memory>
+#include "pre_activate/mem_reuse/mem_reuse.h"
+#include "pre_activate/mem_reuse/mem_reuse_allocator.h"
+namespace mindspore {
+namespace device {
+const int kStaticMem = 0;
+const int kDynamicMem = 1;
+const int kReuseDynamicMem = 2;
+const int kGetAllOuts = -1;
+const uint64_t kMemAlignSize = 512;
+using MemReuseUtilPtr = mindspore::memreuse::MemReuseUtilPtr;
+
+class MemoryManager {
+ public:
+  MemoryManager() = default;
+  virtual ~MemoryManager();
+
+  virtual void MallocDeviceMemory() = 0;
+  virtual void FreeDeviceMemory() = 0;
+  void ResetDynamicMemory() {
+    total_dynamic_size_ = 0;
+    dynamic_mem_offset_ = 0;
+  }
+
+  void InitReuseDynamicMemory(session::KernelGraph *graph);
+  uint8_t *MallocOutputMem(const AnfNodePtr &node, size_t index, int flag, size_t size);
+  uint8_t *MallocWorkSpaceMem(const AnfNodePtr &node, size_t index, int flag, size_t size);
+  virtual uint8_t *MallocMem(int flag, size_t size);
+
+  // Allocate memory from the dynamic memory pool.
+  virtual void *AllocTensorMemDynamic(size_t size);
+  // Free memory back to the dynamic memory pool.
+  virtual void FreeTensorMemDynamic(void *device_ptr);
+  virtual void MallocOpMemory(const DeviceAddressPtr address, size_t size);
+  size_t GetCommonAlignSize(size_t input_size) const;
+  size_t GetCommunicationAlignSize(size_t input_size) const;
+
+ protected:
+  virtual uint8_t *MallocStaticMem(size_t size, bool communication_mem);
+  virtual uint8_t *MallocDynamicMem(size_t size, bool communication_mem);
+  uint8_t *device_mem_base_{nullptr};
+  uint8_t *device_mem_pool_base_{nullptr};
+  uint64_t device_mem_size_{0};
+  uint64_t device_mem_pool_size_{0};
+  uint64_t dynamic_mem_offset_{0};
+  uint64_t static_mem_offset_{0};
+  size_t total_static_size_ = 0;
+  size_t total_dynamic_size_ = 0;
+  MemReuseUtilPtr mem_reuse_util_ptr_{nullptr};
+};
+}  // namespace device
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEVICE_MEMORY_MANAGER_H_
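Note: the base-class `AllocTensorMemDynamic`/`FreeTensorMemDynamic` are deliberate no-op defaults (they only log and return nullptr), so the base `MallocOpMemory` throws unless a backend with a real pool overrides both hooks; the `mem_dynamic_alloc_` flag it sets tells the DeviceAddress to be freed through the pool rather than the arena. A sketch of that hook pattern (names are stand-ins; std::malloc stands in for a device pool):

    #include <cstdlib>

    // Base hooks: deliberate no-ops, as in MemoryManager.
    class PoolHooks {
     public:
      virtual ~PoolHooks() = default;
      virtual void *AllocTensorMemDynamic(size_t) { return nullptr; }  // no pool by default
      virtual void FreeTensorMemDynamic(void *) {}
    };

    // A pool-backed backend overrides both hooks.
    class MallocPool : public PoolHooks {
     public:
      void *AllocTensorMemDynamic(size_t size) override { return std::malloc(size); }
      void FreeTensorMemDynamic(void *p) override { std::free(p); }
    };

    // Trimmed stand-in for DeviceAddress.
    struct DeviceAddressLike {
      void *ptr_ = nullptr;
      bool mem_dynamic_alloc_ = false;  // "free me through the pool" flag
    };

    void MallocOpMemory(PoolHooks *pool, DeviceAddressLike *addr, size_t size) {
      addr->ptr_ = pool->AllocTensorMemDynamic(size);  // the real code throws if this is null
      addr->mem_dynamic_alloc_ = true;                 // consumed at free time
    }

    int main() {
      MallocPool pool;
      DeviceAddressLike addr;
      MallocOpMemory(&pool, &addr, 64);
      pool.FreeTensorMemDynamic(addr.ptr_);
      return 0;
    }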
--- a/mindspore/ccsrc/session/anf_runtime_algorithm.cc
+++ b/mindspore/ccsrc/session/anf_runtime_algorithm.cc
@@ -857,5 +857,15 @@ void AnfRuntimeAlgorithm::SetNodeInput(const CNodePtr &node, const AnfNodePtr &input_node,
   MS_EXCEPTION_IF_NULL(input_node);
   node->set_input(index + 1, input_node);
 }
+
+bool AnfRuntimeAlgorithm::IsCommunicationOp(const AnfNodePtr &node) {
+  MS_EXCEPTION_IF_NULL(node);
+  auto kernel_name = AnfAlgo::GetCNodeName(node);
+  auto kernel_type = AnfAlgo::GetKernelType(node);
+  if (kernel_name == kAllReduceOpName || kernel_type == HCCL_KERNEL) {
+    return true;
+  }
+  return false;
+}
 }  // namespace session
 }  // namespace mindspore
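Note: the communication-op test moved out of KernelRuntime into AnfRuntimeAlgorithm so the runtime and the memory manager share one definition: an op counts as communication if it is named AllReduce or compiled as an HCCL kernel. A toy version of the predicate (assuming kAllReduceOpName is the literal "AllReduce"):

    #include <cassert>
    #include <string>

    enum KernelType { TBE_KERNEL, HCCL_KERNEL };  // trimmed-down stand-in

    bool IsCommunicationOp(const std::string &kernel_name, KernelType kernel_type) {
      return kernel_name == "AllReduce" || kernel_type == HCCL_KERNEL;
    }

    int main() {
      assert(IsCommunicationOp("AllReduce", TBE_KERNEL));   // matched by name
      assert(IsCommunicationOp("AllGather", HCCL_KERNEL));  // matched by kernel type
      assert(!IsCommunicationOp("Conv2D", TBE_KERNEL));
      return 0;
    }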
--- a/mindspore/ccsrc/session/anf_runtime_algorithm.h
+++ b/mindspore/ccsrc/session/anf_runtime_algorithm.h
@@ -166,6 +166,7 @@ class AnfRuntimeAlgorithm {
   static bool IsFeatureMapInput(const AnfNodePtr &node, size_t input_index);
   // get real input index for some tbe ops which input order is different between me and tbe impl
   static size_t GetRealInputIndex(const AnfNodePtr &anf_node, const size_t cur_index);
+  static bool IsCommunicationOp(const AnfNodePtr &node);
 };
 }  // namespace session
 using AnfAlgo = session::AnfRuntimeAlgorithm;
--- a/mindspore/ccsrc/session/gpu_session.cc
+++ b/mindspore/ccsrc/session/gpu_session.cc
@@ -102,10 +102,6 @@ GraphId GPUSession::CompileGraph(const AnfNodePtrList &lst, const AnfNodePtrList &outputs) {
   graph->set_execution_order(execution_order);
   // Alloc memory, including static memory and dynamic memory
   AllocateMemory(graph.get());
-  // Reset memory resource
-  auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
-  MS_EXCEPTION_IF_NULL(runtime_instance);
-  runtime_instance->FreeHostMemory();
   return graph_id;
 }
--- a/tests/ut/cpp/CMakeLists.txt
+++ b/tests/ut/cpp/CMakeLists.txt
@@ -85,6 +85,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         "../../../mindspore/ccsrc/kernel/oplib/*.cc"
         "../../../mindspore/ccsrc/kernel/tbe/*.cc"
         "../../../mindspore/ccsrc/device/kernel_runtime.cc"
+        "../../../mindspore/ccsrc/device/memory_manager.cc"
         "../../../mindspore/ccsrc/device/kernel_runtime_manager.cc"
         "../../../mindspore/ccsrc/device/kernel_info.cc"
         "../../../mindspore/ccsrc/device/ascend/profiling/*.cc"
@@ -92,6 +93,7 @@ file(GLOB_RECURSE MINDSPORE_SRC_LIST RELATIVE ${CMAKE_CURRENT_SOURCE_DIR}
         "../../../mindspore/ccsrc/device/convert_tensor_utils.cc"
         "../../../mindspore/ccsrc/device/ascend/kernel_build_ascend.cc"
         "../../../mindspore/ccsrc/device/ascend/ascend_kernel_runtime.cc"
+        "../../../mindspore/ccsrc/device/ascend/ascend_memory_manager.cc"
         "../../../mindspore/ccsrc/device/ascend/ascend_device_address.cc"
         "../../../mindspore/ccsrc/device/ascend/ascend_memory_allocator.cc"
         "../../../mindspore/ccsrc/predict/generator/utils/ir_model_util.cc"
Reference in New Issue