fix: TDT hangs when the device is occupied

This commit is contained in:
jonyguo 2020-07-29 09:45:54 +08:00
parent 9e1244934c
commit b9d855cbca
1 changed file with 7 additions and 5 deletions

View File

@ -194,17 +194,19 @@ bool MsContext::OpenTsd() {
}
MS_LOG(INFO) << "Device id = " << device_id << ", rank size = " << rank_size << ".";
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}
int32_t initStatus = tdt::TdtHostInit(device_id);
if (initStatus != TDT_OK_CODE) {
MS_LOG(EXCEPTION) << "Init tsd failed, status = " << initStatus << ".";
return false;
}
tdt_print_ = std::thread(TensorPrint());
TDT_StatusT status = tdt::TsdClient::GetInstance()->Open(device_id, rank_size);
if (status != TDT_OK) {
MS_LOG(EXCEPTION) << "Device " << device_id << " is occupied, open tsd failed, status = " << status << ".";
return false;
}
tsd_ref_++;
MS_LOG(INFO) << "Open and init tsd successful, tsd reference = " << tsd_ref_ << ".";
return true;