!18459 Add device ID info when memory allocation fails

Merge pull request !18459 from limingqi107/bug_fix
This commit is contained in:
i-robot 2021-06-17 17:14:55 +08:00 committed by Gitee
commit 8128f8ffe2
2 changed files with 21 additions and 6 deletions

View File

@@ -894,7 +894,11 @@ bool GPUKernelRuntime::AddMemorySwapTask(const AnfNodePtr &kernel, bool mock, bo
device_address->set_status(DeviceAddressStatus::kInDevice);
} else if (status == DeviceAddressStatus::kInHost) {
if (!device_address->ptr_ && !AttemptMallocMem(device_address, device_address->size_, mock)) {
return false;
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
MS_LOG(EXCEPTION) << "Device(id:" << device_id
<< ") memory isn't enough and alloc failed, alloc size:" << device_address->size_;
}
float cost_time = 0;
mem_swap_manager_->AddMemSwapTask(SwapKind::kHostToDevice, device_address, host_address, mock, profiling,
@@ -1072,7 +1076,11 @@ bool GPUKernelRuntime::AllocKernelOutputDynamicRes(const mindspore::kernel::Kern
auto device_address = GetMutableOutputAddr(kernel, i, false);
MS_EXCEPTION_IF_NULL(device_address);
if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, output_sizes[i], mock)) {
return false;
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
MS_LOG(EXCEPTION) << "Device(id:" << device_id
<< ") memory isn't enough and alloc failed, alloc size:" << output_sizes[i];
}
kernel::AddressPtr output = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(output);
@@ -1096,7 +1104,11 @@ bool GPUKernelRuntime::AllocKernelWorkspaceDynamicRes(const mindspore::kernel::K
}
auto device_address = AnfAlgo::GetMutableWorkspaceAddr(kernel, i);
if (device_address->ptr_ == nullptr && !AttemptMallocMem(device_address, workspace_sizes[i], mock)) {
return false;
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto device_id = context_ptr->get_param<uint32_t>(MS_CTX_DEVICE_ID);
MS_LOG(EXCEPTION) << "Device(id:" << device_id
<< ") memory isn't enough and alloc failed, alloc size:" << workspace_sizes[i];
}
kernel::AddressPtr workspace = std::make_shared<kernel::Address>();
MS_EXCEPTION_IF_NULL(workspace);

View File

@@ -35,7 +35,8 @@ void MemoryManagerActor::AllocateMemory(std::vector<DeviceTensor *> *alloc_list,
}
// Allocate memory through the device context.
if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
std::string error_info = "Device memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
std::string error_info = "Device(id:" + std::to_string(device_context->device_context_key().device_id_) +
") memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
", alloc size: " + std::to_string(device_tensor->GetSize());
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), error_info);
}
@@ -69,7 +70,8 @@ void MemoryManagerActor::AllocateContinuousMemory(std::vector<std::vector<Device
auto &device_context = (*device_contexts)[i];
// Allocate memory through the device context.
if (!device_context->AllocateContinuousMemory(alloc_list, total_size, size_list)) {
std::string error_info = "Device memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
std::string error_info = "Device(id:" + std::to_string(device_context->device_context_key().device_id_) +
") memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
", alloc size: " + std::to_string(total_size);
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), error_info);
}
@@ -101,7 +103,8 @@ void MemoryManagerActor::AllocateBatchMemory(std::vector<DeviceTensor *> *alloc_
// Allocate memory through the device context.
if (!device_context->AllocateMemory(device_tensor, device_tensor->GetSize())) {
std::string error_info = "Device memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
std::string error_info = "Device(id:" + std::to_string(device_context->device_context_key().device_id_) +
") memory isn't enough and alloc failed, actor name: " + from_aid.Name() +
", alloc size: " + std::to_string(device_tensor->GetSize());
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*op_context), error_info);
}