!18267 Fix bug in gpu dump and ascend distribution training dump

Merge pull request !18267 from TinaMengtingZhang/bugfix-1
This commit is contained in:
i-robot 2021-06-15 21:29:29 +08:00 committed by Gitee
commit d45b39290b
4 changed files with 16 additions and 10 deletions

View File

@ -835,8 +835,10 @@ void AscendSession::InitRuntimeResource() {
if (!runtime_instance->Init()) {
MS_LOG(EXCEPTION) << "Kernel runtime init error.";
}
auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
auto env_rank_id = common::GetEnv("RANK_ID");
if (!(env_table_file.empty() || env_rank_id.empty())) {
// get actual rank id if it's a distributed training case.
rank_id_ = GetRankId();
}
DumpInit(rank_id_);

View File

@ -206,10 +206,12 @@ bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, s
return false;
}
std::string npy_header = GenerateNpyHeader(shape, type);
fd << npy_header;
(void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
fd.close();
ChangeFileMode(file_path, S_IRUSR);
if (!npy_header.empty()) {
fd << npy_header;
(void)fd.write(reinterpret_cast<const char *>(data), SizeToLong(len));
fd.close();
ChangeFileMode(file_path, S_IRUSR);
}
return true;
}

View File

@ -109,7 +109,8 @@ std::string GenerateNpyHeader(const ShapeVector &shape, TypeId type_id, bool for
};
auto type_desc = type_desc_map.find(type_id);
if (type_desc == type_desc_map.end()) {
MS_LOG(EXCEPTION) << "Not support dump the " << TypeIdToType(type_id)->ToString() << " data to npy file.";
MS_LOG(WARNING) << "Not support dump the " << TypeIdToType(type_id)->ToString() << " data to npy file.";
return std::string();
}
NpyHeader npy_header{type_desc->second, fortran_order, shape};

View File

@ -140,9 +140,10 @@ void DataDumper::SetOpMappingInfo(NotNull<aicpu::dump::OpMappingInfo *> dump_inf
}
uint32_t graph_id = kernel_graph_->graph_id();
uint32_t rank_id = 0;
auto env_hccl_mode = common::GetEnv("MS_ENABLE_HCCL");
if (!env_hccl_mode.empty() && env_hccl_mode != std::to_string(0)) {
// get actual rank id if hccl is initiated.
auto env_table_file = common::GetEnv("RANK_TABLE_FILE");
auto env_rank_id = common::GetEnv("RANK_ID");
if (!(env_table_file.empty() || env_rank_id.empty())) {
// get actual rank id if it's a distributed training case.
if (!CommManager::GetInstance().GetRankID(kHcclWorldGroup, &rank_id)) {
MS_LOG(INFO) << "Failed to get rank id.";
}