!3627 fix: TDT hangs when the device is occupied

Merge pull request !3627 from guozhijian/fix_device_occupied_tdt_hung
This commit is contained in:
mindspore-ci-bot 2020-07-29 14:40:35 +08:00 committed by Gitee
commit 44e739ae31
1 changed file with 7 additions and 5 deletions

View File

@ -194,17 +194,19 @@ bool MsContext::OpenTsd() {
}
MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}
int32_t initStatus = tdt::TdtHostInit(device_id);
if (initStatus != TDT_OK_CODE) {
MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << ".";
return false;
}
tdt_print_ = std::thread(TensorPrint());
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}
tsd_ref_++;
MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " << tsd_ref_ << ".";
return true;