!36194 fix the error info of memory alloc not enough

Merge pull request !36194 from limingqi107/bug_fix3
This commit is contained in:
i-robot 2022-06-20 17:16:43 +00:00 committed by Gitee
commit b7ee317083
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
9 changed files with 67 additions and 12 deletions

View File

@ -325,12 +325,24 @@ inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t inde
return nullptr;
}
if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
MS_LOG(ERROR) << "The device address is empty, address index: " << index << ", and the length of 'addr_list' is "
if (addr_list[index] == nullptr) {
MS_LOG(ERROR) << "The device address is nullptr, address index: " << index << ", and the length of 'addr_list' is "
<< addr_list.size();
return nullptr;
}
if (addr_list[index]->addr == nullptr) {
MS_LOG(ERROR) << "The memory of device address is nullptr, address index: " << index
<< ", and the length of 'addr_list' is " << addr_list.size();
return nullptr;
}
if (addr_list[index]->size == 0) {
MS_LOG(ERROR) << "The size of device address is zero, address index: " << index
<< ", and the length of 'addr_list' is " << addr_list.size();
return nullptr;
}
return reinterpret_cast<T *>(addr_list[index]->addr);
}
} // namespace kernel

View File

@ -195,10 +195,24 @@ class DeprecatedNativeGpuKernelMod : public NativeGpuKernelMod {
MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
}
if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
if (addr_list[index] == nullptr) {
auto kernel_node = kernel_node_.lock();
const std::string &prim_name = (kernel_node == nullptr ? "" : common::AnfAlgo::GetCNodeName(kernel_node));
MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index << ", op name is: " << prim_name;
MS_LOG(EXCEPTION) << "The device address is nullptr, address index: " << index << ", op name is: " << prim_name;
}
if (addr_list[index]->addr == nullptr) {
auto kernel_node = kernel_node_.lock();
const std::string &prim_name = (kernel_node == nullptr ? "" : common::AnfAlgo::GetCNodeName(kernel_node));
MS_LOG(EXCEPTION) << "The memory of device address is nullptr, address index: " << index
<< ", op name is: " << prim_name;
}
if (addr_list[index]->size == 0) {
auto kernel_node = kernel_node_.lock();
const std::string &prim_name = (kernel_node == nullptr ? "" : common::AnfAlgo::GetCNodeName(kernel_node));
MS_LOG(EXCEPTION) << "The size of device address is zero, address index: " << index
<< ", op name is: " << prim_name;
}
return reinterpret_cast<T *>(addr_list[index]->addr);

View File

@ -27,6 +27,11 @@ bool ActorDispatcher::is_multi_thread_execution_ = true;
bool ActorDispatcher::is_memory_allocation_sync_ = true;
bool ActorDispatcher::is_memory_free_sync_ = true;
// Tells whether the given op context already carries an error message.
// Throws (via MS_EXCEPTION_IF_NULL) when `context` is a null pointer.
bool IsRunningFailed(const OpContext<DeviceTensor> *context) {
  MS_EXCEPTION_IF_NULL(context);
  // A non-empty error_info_ is treated as the failure marker.
  const auto &error_message = context->error_info_;
  const bool has_failed = !error_message.empty();
  return has_failed;
}
void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num) {
MS_EXCEPTION_IF_NULL(actor_thread_num);
MS_EXCEPTION_IF_NULL(actor_and_kernel_thread_num);

View File

@ -243,6 +243,8 @@ class ActorDispatcher {
static bool is_memory_free_sync_;
};
// Returns true when the error info stored in the context is not empty; the context must not be null.
bool IsRunningFailed(const OpContext<DeviceTensor> *context);
void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num);
bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);

View File

@ -69,6 +69,9 @@ void ExitActor::SendOutput(OpContext<DeviceTensor> *const context) {
void ExitActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
MS_EXCEPTION_IF_NULL(context);
if (IsRunningFailed(context)) {
return;
}
// 1.Send output in base class.
ControlActor::SendOutput(context);

View File

@ -80,6 +80,9 @@ void CopyActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
MS_EXCEPTION_IF_NULL(context);
MS_EXCEPTION_IF_NULL(output_device_tensor_[0]);
MS_EXCEPTION_IF_NULL(input_device_tensor_[0]);
if (IsRunningFailed(context)) {
return;
}
if (input_device_tensor_[0]->GetSize() != output_device_tensor_[0]->GetSize()) {
MS_LOG(WARNING) << GetAID().Name() << " copy size is not equal, input size:" << input_device_tensor_[0]->GetSize()

View File

@ -375,20 +375,23 @@ void DataPrepareActor::PrepareData(const std::vector<std::vector<TensorPtr>> &in
} else if (strategy_ == GraphExecutionStrategy::kStep) {
PrepareDataForStepMode(input_tensors, context);
}
UpdateGraphsRefNodeAddress(graph_compiler_info_->graphs_);
// Debug actor is blocked, must wait debug actor callback message to process continue.
if (debug_aid_ != nullptr && strategy_ == GraphExecutionStrategy::kPipeline) {
SendDebugReq(context);
return;
}
} catch (const std::exception &e) {
std::string error_info = e.what();
SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(real_strategy_, (*context), error_info);
}
}
if (IsRunningFailed(context)) {
return;
}
UpdateGraphsRefNodeAddress(graph_compiler_info_->graphs_);
// Debug actor is blocked, must wait debug actor callback message to process continue.
if (debug_aid_ != nullptr && strategy_ == GraphExecutionStrategy::kPipeline) {
SendDebugReq(context);
return;
}
// Allocate continuous memory and send output to trigger the step running.
if (continuous_memory_alloc_list_list_.size() > 0) {
SendMemoryAllocReq(context);
@ -429,6 +432,10 @@ void DataPrepareActor::SendMemoryAllocReq(OpContext<DeviceTensor> *const context
// Callback invoked after the memory allocation request has finished.
void DataPrepareActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
  MS_EXCEPTION_IF_NULL(context);
  // Only move on to the post-run stage when no failure has been recorded in the context.
  if (!IsRunningFailed(context)) {
    PostRun(context);
  }
}

View File

@ -128,6 +128,9 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
MS_EXCEPTION_IF_NULL(context);
MS_EXCEPTION_IF_NULL(data_kernel_);
MS_EXCEPTION_IF_NULL(device_contexts_[0]);
if (IsRunningFailed(context)) {
return;
}
if (buffers_.size() == 0) {
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The data queue is empty.");
}
@ -241,6 +244,9 @@ void HostQueueDataSourceActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const
void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
MS_EXCEPTION_IF_NULL(context);
if (IsRunningFailed(context)) {
return;
}
if (buffers_.size() == 0) {
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The data queue is empty.");
}

View File

@ -233,6 +233,9 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
MS_EXCEPTION_IF_NULL(context);
MS_EXCEPTION_IF_NULL(kernel_);
MS_EXCEPTION_IF_NULL(device_contexts_[0]);
if (IsRunningFailed(context)) {
return;
}
PreLaunchKernel(context);
try {