modify push failed log

This commit is contained in:
ms_yan 2022-01-22 11:48:45 +08:00
parent 9c8a4279c8
commit 29120febaf
2 changed files with 37 additions and 21 deletions

View File

@ -224,6 +224,12 @@ Status DeviceQueueOp::SendDataToAscend() {
DetectPerBatchTime(&batch_record_start, &batch_record_end);
#endif
PrintBeginInfoWhenFirstBatch(first_push_flag_);
// when training stopped, handle might have been destroyed immediately
if (tdtInstancePtr->acl_handle_ == nullptr) {
MS_LOG(WARNING) << "Thread has already been terminated.";
is_break_loop = true;
continue;
}
RETURN_IF_NOT_OK(SendRowToTdt(curr_row, is_profiling_enable, &tdt_cost));
PrintEndInfoWhenFirstBatch(&first_push_flag_);
#ifndef ENABLE_SECURITY
@ -258,20 +264,8 @@ Status DeviceQueueOp::SendDataToAscend() {
TensorRow dummy_row;
auto status = tdtInstancePtr->hostPush(dummy_row, true, channel_name_, is_profiling_enable, tdt_cost,
ACL_TENSOR_DATA_END_OF_SEQUENCE);
if (status != Status::OK()) {
if (stop_send_) {
send_finished_ = true;
MS_LOG(INFO) << "stop_send received";
return Status::OK();
}
return Status(StatusCode::kMDTDTPushFailure,
"TDT Push data into device Failed, check the first error or TraceBack first, following are"
" several possible checking way: 1) if training is not ready, still in network graph compiling"
" stage, check error raised by Network used operator or environment configuration. 2) if"
" interrupt in middle process of training, may check whether dataset sending num and network"
" training num mismatch. 3) if this error raised in end of training, ignore this. 4) other cases,"
" try find ascend host log or checking info log etc or search this in mindspore's FAQ.");
}
RETURN_IF_NOT_OK(CheckPushStatus(status, stop_send_, &send_finished_, &is_break_loop));
MS_LOG(INFO) << "an epoch has already sent, now stop send data.";
stop_send_ = true;
}
@ -322,13 +316,11 @@ Status DeviceQueueOp::SendRowToTdt(TensorRow curr_row, bool is_profiling_enable,
MS_LOG(INFO) << "stop_send received";
return Status::OK();
}
return Status(StatusCode::kMDTDTPushFailure,
"TDT Push data into device Failed, check the first error or TraceBack first, following are"
" several possible checking way: 1) if training is not ready, still in network graph compiling"
" stage, check error raised by Network used operator or environment configuration. 2) if"
" interrupt in middle process of training, may check whether dataset sending num and network"
" training num mismatch. 3) if this error raised in end of training, ignore this. 4) other cases,"
" try find ascend host log or checking info log ects or search this in mindspore's FAQ.");
return Status(
StatusCode::kMDTDTPushFailure,
"TDT Push data into device Failed, check the first error or TraceBack first, more checking advises are: "
"1) if training is not ready, error might raised by network computing operator or environment configuration. "
"2) other cases, checking info level log or search this error in mindspore's FAQ for detail solution.");
}
if (create_data_info_queue_) {
DATA_INFO data_info;
@ -338,6 +330,28 @@ Status DeviceQueueOp::SendRowToTdt(TensorRow curr_row, bool is_profiling_enable,
}
return Status::OK();
}
Status DeviceQueueOp::CheckPushStatus(Status status, bool stop_send, bool *send_finished, bool *is_break_loop) {
if (status != Status::OK()) {
if (stop_send) {
*send_finished = true;
MS_LOG(INFO) << "stop_send received";
return Status::OK();
}
// when training stopped, handle might have been destroyed immediately
if (tdtInstancePtr->acl_handle_ == nullptr) {
*is_break_loop = true;
MS_LOG(WARNING) << "Thread has already been terminated.";
return Status::OK();
}
return Status(
StatusCode::kMDTDTPushFailure,
"TDT Push data into device Failed, check the first error or TraceBack first, more checking advises are: "
"1) if training is not ready, error might raised by network computing operator or environment configuration. "
"2) other cases, checking info level log or search this error in mindspore's FAQ for detail solution.");
}
return Status::OK();
}
#endif
#ifdef ENABLE_TDTQUE

View File

@ -135,6 +135,8 @@ class DeviceQueueOp : public PipelineOp {
Status SendDataToAscend();
void LimitSendingBatches(int64_t send_batch, int64_t *sending_num, std::shared_ptr<ConfigManager> cfg);
Status SendRowToTdt(TensorRow curr_row, bool is_profiling_enable, int32_t *tdt_cost);
// check status that push data into device
Status CheckPushStatus(Status status, bool stop_send, bool *send_finished, bool *is_break_loop);
bool ascend_keep_waiting_;
#endif