forked from mindspore-Ecosystem/mindspore
!18459 Add device ID info to the error message when memory allocation fails
Merge pull request !18459 from limingqi107/bug_fix
This commit is contained in:
commit
8128f8ffe2
|
@ -894,7 +894,11 @@ bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bo
|
|||
device_address->set_status(DeviceAddressStatus::kInDevice);
|
||||
} else if (status == DeviceAddressStatus::kInHost) {
|
||||
if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
|
||||
return false;
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
MS_LOG(EXCEPTION) << "Device(id:" << device_id
|
||||
<< ") memory isn't enough and alloc failed, alloc size:" << device_address->size_;
|
||||
}
|
||||
float cost_time = 0;
|
||||
mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
|
||||
|
@ -1072,7 +1076,11 @@ bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::Kern
|
|||
auto device_address = GetMutableOutputAddr(kernel, i, false);
|
||||
MS_EXCEPTION_IF_NULL(device_address);
|
||||
if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
|
||||
return false;
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
MS_LOG(EXCEPTION) << "Device(id:" << device_id
|
||||
<< ") memory isn't enough and alloc failed, alloc size:" << output_sizes[i];
|
||||
}
|
||||
kernel::AddressPtr output = std::make_shared<kernel::Address>();
|
||||
MS_EXCEPTION_IF_NULL(output);
|
||||
|
@ -1096,7 +1104,11 @@ bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::K
|
|||
}
|
||||
auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
|
||||
if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
|
||||
return false;
|
||||
auto context_ptr = MsContext::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(context_ptr);
|
||||
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
|
||||
MS_LOG(EXCEPTION) << "Device(id:" << device_id
|
||||
<< ") memory isn't enough and alloc failed, alloc size:" << workspace_sizes[i];
|
||||
}
|
||||
kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
|
||||
MS_EXCEPTION_IF_NULL(workspace);
|
||||
|
|
|
@ -35,7 +35,8 @@ void MemoryManagerActor::AllocateMemory(std::vector<DeviceTensor *> *alloc_list,
|
|||
}
|
||||
// Allocate memory through the device context.
|
||||
if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
|
||||
std::string error_info = "Device memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
|
||||
std::string error_info = "Device(id:" + std::to_string(device_context->device_context_key().device_id_) +
|
||||
") memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
|
||||
", alloc size: " + std::to_string(device_tensor->GetSize());
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), error_info);
|
||||
}
|
||||
|
@ -69,7 +70,8 @@ void MemoryManagerActor::AllocateContinuousMemory(std::vector<std::vector<Device
|
|||
auto &device_context = (*device_contexts)[i];
|
||||
// Allocate memory through the device context.
|
||||
if (!device_context->AllocateContinuousMemory(alloc_list, total_size, size_list)) {
|
||||
std::string error_info = "Device memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
|
||||
std::string error_info = "Device(id:" + std::to_string(device_context->device_context_key().device_id_) +
|
||||
") memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
|
||||
", alloc size: " + std::to_string(total_size);
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), error_info);
|
||||
}
|
||||
|
@ -101,7 +103,8 @@ void MemoryManagerActor::AllocateBatchMemory(std::vector<DeviceTensor *> *alloc_
|
|||
|
||||
// Allocate memory through the device context.
|
||||
if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
|
||||
std::string error_info = "Device memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
|
||||
std::string error_info = "Device(id:" + std::to_string(device_context->device_context_key().device_id_) +
|
||||
") memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
|
||||
", alloc size: " + std::to_string(device_tensor->GetSize());
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), error_info);
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue