forked from mindspore-Ecosystem/mindspore
!18267 Fix bug in gpu dump and ascend distribution training dump
Merge pull request !18267 from TinaMengtingZhang/bugfix-1
This commit is contained in:
commit
d45b39290b
|
@ -835,8 +835,10 @@ void AscendSession::InitRuntimeResource() {
|
|||
if (!runtime_instance->Init()) {
|
||||
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
|
||||
}
|
||||
auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
|
||||
if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
|
||||
auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
|
||||
auto env_rank_id = common::GetEnv("RANK_ID");
|
||||
if (!(env_table_file.empty() || env_rank_id.empty())) {
|
||||
// Get the actual rank id in the distributed training case.
|
||||
rank_id_ = GetRankId();
|
||||
}
|
||||
DumpInit(rank_id_);
|
||||
|
|
|
@ -206,10 +206,12 @@ bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, s
|
|||
return false;
|
||||
}
|
||||
std::string npy_header = GenerateNpyHeader(shape, type);
|
||||
fd << npy_header;
|
||||
(void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
|
||||
fd.close();
|
||||
ChangeFileMode(file_path, S_IRUSR);
|
||||
if (!npy_header.empty()) {
|
||||
fd << npy_header;
|
||||
(void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
|
||||
fd.close();
|
||||
ChangeFileMode(file_path, S_IRUSR);
|
||||
}
|
||||
return true;
|
||||
}
|
||||
|
||||
|
|
|
@ -109,7 +109,8 @@ std::string GenerateNpyHeader(const ShapeVector &shape, TypeId type_id, bool for
|
|||
};
|
||||
auto type_desc = type_desc_map.find(type_id);
|
||||
if (type_desc == type_desc_map.end()) {
|
||||
MS_LOG(EXCEPTION) << "Not support dump the " << TypeIdToType(type_id)->ToString() << " data to npy file.";
|
||||
MS_LOG(WARNING) << "Not support dump the " << TypeIdToType(type_id)->ToString() << " data to npy file.";
|
||||
return std::string();
|
||||
}
|
||||
|
||||
NpyHeader npy_header{type_desc->second, fortran_order, shape};
|
||||
|
|
|
@ -140,9 +140,10 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
|
|||
}
|
||||
uint32_t graph_id = kernel_graph_->graph_id();
|
||||
uint32_t rank_id = 0;
|
||||
auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
|
||||
if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
|
||||
// get actual rank id if hccl is initiated.
|
||||
auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
|
||||
auto env_rank_id = common::GetEnv("RANK_ID");
|
||||
if (!(env_table_file.empty() || env_rank_id.empty())) {
|
||||
// Get the actual rank id in the distributed training case.
|
||||
if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
|
||||
MS_LOG(INFO) << "Failed to get rank id.";
|
||||
}
|
||||
|
|
Loading…
Reference in New Issue