forked from mindspore-Ecosystem/mindspore
!3627 fix: device occupied tdt hung
Merge pull request !3627 from guozhijian/fix_device_occupied_tdt_hung
This commit is contained in:
commit
44e739ae31
|
@ -194,17 +194,19 @@ bool MsContext::OpenTsd() {
|
||||||
}
|
}
|
||||||
|
|
||||||
MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
|
MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
|
||||||
|
|
||||||
|
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
|
||||||
|
if (status != TDT_OK) {
|
||||||
|
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
|
||||||
int32_t initStatus = tdt::TdtHostInit(device_id);
|
int32_t initStatus = tdt::TdtHostInit(device_id);
|
||||||
if (initStatus != TDT_OK_CODE) {
|
if (initStatus != TDT_OK_CODE) {
|
||||||
MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << ".";
|
MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << ".";
|
||||||
return false;
|
return false;
|
||||||
}
|
}
|
||||||
tdt_print_ = std::thread(TensorPrint());
|
tdt_print_ = std::thread(TensorPrint());
|
||||||
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
|
|
||||||
if (status != TDT_OK) {
|
|
||||||
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
|
|
||||||
return false;
|
|
||||||
}
|
|
||||||
tsd_ref_++;
|
tsd_ref_++;
|
||||||
MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " << tsd_ref_ << ".";
|
MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " << tsd_ref_ << ".";
|
||||||
return true;
|
return true;
|
||||||
|
|
Loading…
Reference in New Issue