forked from mindspore-Ecosystem/mindspore
!36194 fix the error info of memory alloc not enough
Merge pull request !36194 from limingqi107/bug_fix3
This commit is contained in:
commit
b7ee317083
|
@ -325,12 +325,24 @@ inline T *GetDeviceAddress(const std::vector<AddressPtr> &addr_list, size_t inde
|
|||
return nullptr;
|
||||
}
|
||||
|
||||
if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
|
||||
MS_LOG(ERROR) << "The device address is empty, address index: " << index << ", and the length of 'addr_list' is "
|
||||
if (addr_list[index] == nullptr) {
|
||||
MS_LOG(ERROR) << "The device address is nullptr, address index: " << index << ", and the length of 'addr_list' is "
|
||||
<< addr_list.size();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (addr_list[index]->addr == nullptr) {
|
||||
MS_LOG(ERROR) << "The memory of device address is nullptr, address index: " << index
|
||||
<< ", and the length of 'addr_list' is " << addr_list.size();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
if (addr_list[index]->size == 0) {
|
||||
MS_LOG(ERROR) << "The size of device address is zero, address index: " << index
|
||||
<< ", and the length of 'addr_list' is " << addr_list.size();
|
||||
return nullptr;
|
||||
}
|
||||
|
||||
return reinterpret_cast<T *>(addr_list[index]->addr);
|
||||
}
|
||||
} // namespace kernel
|
||||
|
|
|
@ -195,10 +195,24 @@ class DeprecatedNativeGpuKernelMod : public NativeGpuKernelMod {
|
|||
MS_LOG(EXCEPTION) << "Address index(" << index << ") out of range(" << addr_list.size() << ")";
|
||||
}
|
||||
|
||||
if ((addr_list[index] == nullptr) || (addr_list[index]->addr == nullptr) || (addr_list[index]->size == 0)) {
|
||||
if (addr_list[index] == nullptr) {
|
||||
auto kernel_node = kernel_node_.lock();
|
||||
const std::string &prim_name = (kernel_node == nullptr ? "" : common::AnfAlgo::GetCNodeName(kernel_node));
|
||||
MS_LOG(EXCEPTION) << "The device address is empty, address index: " << index << ", op name is: " << prim_name;
|
||||
MS_LOG(EXCEPTION) << "The device address is nullptr, address index: " << index << ", op name is: " << prim_name;
|
||||
}
|
||||
|
||||
if (addr_list[index]->addr == nullptr) {
|
||||
auto kernel_node = kernel_node_.lock();
|
||||
const std::string &prim_name = (kernel_node == nullptr ? "" : common::AnfAlgo::GetCNodeName(kernel_node));
|
||||
MS_LOG(EXCEPTION) << "The memory of device address is nullptr, address index: " << index
|
||||
<< ", op name is: " << prim_name;
|
||||
}
|
||||
|
||||
if (addr_list[index]->size == 0) {
|
||||
auto kernel_node = kernel_node_.lock();
|
||||
const std::string &prim_name = (kernel_node == nullptr ? "" : common::AnfAlgo::GetCNodeName(kernel_node));
|
||||
MS_LOG(EXCEPTION) << "The size of device address is zero, address index: " << index
|
||||
<< ", op name is: " << prim_name;
|
||||
}
|
||||
|
||||
return reinterpret_cast<T *>(addr_list[index]->addr);
|
||||
|
|
|
@ -27,6 +27,11 @@ bool ActorDispatcher::is_multi_thread_execution_ = true;
|
|||
bool ActorDispatcher::is_memory_allocation_sync_ = true;
|
||||
bool ActorDispatcher::is_memory_free_sync_ = true;
|
||||
|
||||
// Reports whether a failure has been recorded on the given op context.
// Throws (via MS_EXCEPTION_IF_NULL) when `context` is null; otherwise returns
// true exactly when the context's `error_info_` is not the empty string.
bool IsRunningFailed(const OpContext<DeviceTensor> *context) {
  MS_EXCEPTION_IF_NULL(context);
  const bool has_error_info = (context->error_info_ != "");
  return has_error_info;
}
|
||||
|
||||
void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num) {
|
||||
MS_EXCEPTION_IF_NULL(actor_thread_num);
|
||||
MS_EXCEPTION_IF_NULL(actor_and_kernel_thread_num);
|
||||
|
|
|
@ -243,6 +243,8 @@ class ActorDispatcher {
|
|||
static bool is_memory_free_sync_;
|
||||
};
|
||||
|
||||
bool IsRunningFailed(const OpContext<DeviceTensor> *context);
|
||||
|
||||
void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num);
|
||||
|
||||
bool IsDeviceQueueDSActor(const AnfNodePtr &node, GraphExecutionStrategy strategy = GraphExecutionStrategy::kPipeline);
|
||||
|
|
|
@ -69,6 +69,9 @@ void ExitActor::SendOutput(OpContext<DeviceTensor> *const context) {
|
|||
|
||||
void ExitActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
|
||||
MS_EXCEPTION_IF_NULL(context);
|
||||
if (IsRunningFailed(context)) {
|
||||
return;
|
||||
}
|
||||
|
||||
// 1.Send output in base class.
|
||||
ControlActor::SendOutput(context);
|
||||
|
|
|
@ -80,6 +80,9 @@ void CopyActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
|
|||
MS_EXCEPTION_IF_NULL(context);
|
||||
MS_EXCEPTION_IF_NULL(output_device_tensor_[0]);
|
||||
MS_EXCEPTION_IF_NULL(input_device_tensor_[0]);
|
||||
if (IsRunningFailed(context)) {
|
||||
return;
|
||||
}
|
||||
|
||||
if (input_device_tensor_[0]->GetSize() != output_device_tensor_[0]->GetSize()) {
|
||||
MS_LOG(WARNING) << GetAID().Name() << " copy size is not equal, input size:" << input_device_tensor_[0]->GetSize()
|
||||
|
|
|
@ -375,20 +375,23 @@ void DataPrepareActor::PrepareData(const std::vector<std::vector<TensorPtr>> &in
|
|||
} else if (strategy_ == GraphExecutionStrategy::kStep) {
|
||||
PrepareDataForStepMode(input_tensors, context);
|
||||
}
|
||||
|
||||
UpdateGraphsRefNodeAddress(graph_compiler_info_->graphs_);
|
||||
|
||||
// Debug actor is blocked, must wait debug actor callback message to process continue.
|
||||
if (debug_aid_ != nullptr && strategy_ == GraphExecutionStrategy::kPipeline) {
|
||||
SendDebugReq(context);
|
||||
return;
|
||||
}
|
||||
} catch (const std::exception &e) {
|
||||
std::string error_info = e.what();
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR_BY_STRATEGY(real_strategy_, (*context), error_info);
|
||||
}
|
||||
}
|
||||
|
||||
if (IsRunningFailed(context)) {
|
||||
return;
|
||||
}
|
||||
|
||||
UpdateGraphsRefNodeAddress(graph_compiler_info_->graphs_);
|
||||
// Debug actor is blocked, must wait debug actor callback message to process continue.
|
||||
if (debug_aid_ != nullptr && strategy_ == GraphExecutionStrategy::kPipeline) {
|
||||
SendDebugReq(context);
|
||||
return;
|
||||
}
|
||||
|
||||
// Allocate continuous memory and send output to trigger the step running.
|
||||
if (continuous_memory_alloc_list_list_.size() > 0) {
|
||||
SendMemoryAllocReq(context);
|
||||
|
@ -429,6 +432,10 @@ void DataPrepareActor::SendMemoryAllocReq(OpContext<DeviceTensor> *const context
|
|||
|
||||
// Continues the actor's flow after a memory allocation request completes
// (presumably invoked as the allocation callback — confirm against the caller).
// `context` must be non-null. PostRun is executed only when no failure has been
// recorded on the context; otherwise the function returns without further work.
void DataPrepareActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
  MS_EXCEPTION_IF_NULL(context);

  // A recorded failure means the step must not proceed.
  if (!IsRunningFailed(context)) {
    PostRun(context);
  }
}
|
||||
|
||||
|
|
|
@ -128,6 +128,9 @@ void DeviceQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *co
|
|||
MS_EXCEPTION_IF_NULL(context);
|
||||
MS_EXCEPTION_IF_NULL(data_kernel_);
|
||||
MS_EXCEPTION_IF_NULL(device_contexts_[0]);
|
||||
if (IsRunningFailed(context)) {
|
||||
return;
|
||||
}
|
||||
if (buffers_.size() == 0) {
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The data queue is empty.");
|
||||
}
|
||||
|
@ -241,6 +244,9 @@ void HostQueueDataSourceActor::SendMemoryFreeReq(OpContext<DeviceTensor> *const
|
|||
|
||||
void HostQueueDataSourceActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
|
||||
MS_EXCEPTION_IF_NULL(context);
|
||||
if (IsRunningFailed(context)) {
|
||||
return;
|
||||
}
|
||||
if (buffers_.size() == 0) {
|
||||
SET_OPCONTEXT_FAIL_RET_WITH_ERROR((*context), "The data queue is empty.");
|
||||
}
|
||||
|
|
|
@ -233,6 +233,9 @@ void KernelActor::OnMemoryAllocFinish(OpContext<DeviceTensor> *const context) {
|
|||
MS_EXCEPTION_IF_NULL(context);
|
||||
MS_EXCEPTION_IF_NULL(kernel_);
|
||||
MS_EXCEPTION_IF_NULL(device_contexts_[0]);
|
||||
if (IsRunningFailed(context)) {
|
||||
return;
|
||||
}
|
||||
PreLaunchKernel(context);
|
||||
|
||||
try {
|
||||
|
|
Loading…
Reference in New Issue