forked from mindspore-Ecosystem/mindspore
Refactor original kernel name so all output tensors are included in stat
dump
This commit is contained in:
parent
0b236f91b9
commit
2a4a98ce9d
|
@ -176,8 +176,8 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
|
||||||
std::to_string(j);
|
std::to_string(j);
|
||||||
if (IsDeviceTargetGPU()) {
|
if (IsDeviceTargetGPU()) {
|
||||||
if (DumpJsonParser::GetInstance().IsStatisticDump()) {
|
if (DumpJsonParser::GetInstance().IsStatisticDump()) {
|
||||||
TensorStatDump stat_dump(GetKernelNodeName(node), op_type, op_name, task_id, stream_id, timestamp, false, j);
|
TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, false, j, j);
|
||||||
stat_dump.DumpTensorStatsToFile(dump_path, debugger);
|
stat_dump.DumpTensorStatsToFile(GetKernelNodeName(node), dump_path, debugger);
|
||||||
}
|
}
|
||||||
if (DumpJsonParser::GetInstance().IsTensorDump()) {
|
if (DumpJsonParser::GetInstance().IsTensorDump()) {
|
||||||
DumpGPUMemToFile(file_path, GetKernelNodeName(node), *addr, int_shapes, type, device_type, trans_flag, j,
|
DumpGPUMemToFile(file_path, GetKernelNodeName(node), *addr, int_shapes, type, device_type, trans_flag, j,
|
||||||
|
@ -261,8 +261,8 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
|
||||||
MS_EXCEPTION_IF_NULL(addr);
|
MS_EXCEPTION_IF_NULL(addr);
|
||||||
if (IsDeviceTargetGPU()) {
|
if (IsDeviceTargetGPU()) {
|
||||||
if (DumpJsonParser::GetInstance().IsStatisticDump()) {
|
if (DumpJsonParser::GetInstance().IsStatisticDump()) {
|
||||||
TensorStatDump stat_dump(tensor_name, op_type, op_name, task_id, stream_id, timestamp, true, slot);
|
TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, true, j, slot);
|
||||||
stat_dump.DumpTensorStatsToFile(dump_path, debugger);
|
stat_dump.DumpTensorStatsToFile(tensor_name, dump_path, debugger);
|
||||||
}
|
}
|
||||||
if (DumpJsonParser::GetInstance().IsTensorDump()) {
|
if (DumpJsonParser::GetInstance().IsTensorDump()) {
|
||||||
DumpGPUMemToFile(file_path, tensor_name, *addr, int_shapes, type, device_type, trans_flag, slot, debugger);
|
DumpGPUMemToFile(file_path, tensor_name, *addr, int_shapes, type, device_type, trans_flag, slot, debugger);
|
||||||
|
@ -313,8 +313,8 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
|
||||||
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output.0";
|
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output.0";
|
||||||
if (IsDeviceTargetGPU()) {
|
if (IsDeviceTargetGPU()) {
|
||||||
if (dump_json_parser.IsStatisticDump()) {
|
if (dump_json_parser.IsStatisticDump()) {
|
||||||
TensorStatDump stat_dump(node_name, "Parameter", dump_name, task_id, stream_id, timestamp, false, 0);
|
TensorStatDump stat_dump("Parameter", dump_name, task_id, stream_id, timestamp, false, 0, 0);
|
||||||
stat_dump.DumpTensorStatsToFile(dump_path, debugger);
|
stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
|
||||||
}
|
}
|
||||||
if (dump_json_parser.IsTensorDump()) {
|
if (dump_json_parser.IsTensorDump()) {
|
||||||
DumpGPUMemToFile(file_path, node_name, *addr, int_shapes, type, device_type, trans_flag, 0, debugger);
|
DumpGPUMemToFile(file_path, node_name, *addr, int_shapes, type, device_type, trans_flag, 0, debugger);
|
||||||
|
@ -433,7 +433,7 @@ void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
|
||||||
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
|
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
|
||||||
|
|
||||||
if (dump_json_parser.IsStatisticDump()) {
|
if (dump_json_parser.IsStatisticDump()) {
|
||||||
CsvWriter::GetInstance().OpenFile(dump_path);
|
TensorStatDump::OpenStatisticsFile(dump_path);
|
||||||
}
|
}
|
||||||
DumpInput(graph, dump_path, debugger);
|
DumpInput(graph, dump_path, debugger);
|
||||||
DumpOutput(graph, dump_path, debugger);
|
DumpOutput(graph, dump_path, debugger);
|
||||||
|
|
|
@ -30,6 +30,7 @@ constexpr auto kCsvHeader =
|
||||||
"Count,Negative Zero Count,Positive Zero Count,NaN Count,Negative Inf Count,Positive Inf Count,Zero Count\n";
|
"Count,Negative Zero Count,Positive Zero Count,NaN Count,Negative Inf Count,Positive Inf Count,Zero Count\n";
|
||||||
constexpr auto kCsvFileName = "statistic.csv";
|
constexpr auto kCsvFileName = "statistic.csv";
|
||||||
} // namespace
|
} // namespace
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
|
bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
|
||||||
if (file_.is_open() && path == file_path_str_) {
|
if (file_.is_open() && path == file_path_str_) {
|
||||||
|
@ -38,13 +39,20 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
|
||||||
if (file_.is_open()) {
|
if (file_.is_open()) {
|
||||||
CloseFile();
|
CloseFile();
|
||||||
}
|
}
|
||||||
|
auto file_path = Common::CreatePrefixPath(path);
|
||||||
|
if (!file_path.has_value()) {
|
||||||
|
MS_LOG(WARNING) << "CreatePrefixPath failed.";
|
||||||
|
return false;
|
||||||
|
}
|
||||||
|
// try to open file
|
||||||
|
std::string file_path_value = file_path.value();
|
||||||
bool first_time_opening = file_path_str_ != path;
|
bool first_time_opening = file_path_str_ != path;
|
||||||
ChangeFileMode(path, S_IWUSR);
|
ChangeFileMode(file_path_value, S_IWUSR);
|
||||||
if (first_time_opening) {
|
if (first_time_opening) {
|
||||||
// remove any possible output from previous runs
|
// remove any possible output from previous runs
|
||||||
file_.open(path, std::ios::out | std::ios::trunc | std::ios::binary);
|
file_.open(file_path_value, std::ios::out | std::ios::trunc | std::ios::binary);
|
||||||
} else {
|
} else {
|
||||||
file_.open(path, std::ios::out | std::ios::app | std::ios::binary);
|
file_.open(file_path_value, std::ios::out | std::ios::app | std::ios::binary);
|
||||||
}
|
}
|
||||||
if (!file_.is_open()) {
|
if (!file_.is_open()) {
|
||||||
MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno);
|
MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno);
|
||||||
|
@ -55,7 +63,7 @@ bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
|
||||||
file_.flush();
|
file_.flush();
|
||||||
file_path_str_ = path;
|
file_path_str_ = path;
|
||||||
}
|
}
|
||||||
MS_LOG(INFO) << "Opened statistics file: " << path;
|
MS_LOG(INFO) << "Opened file: " << path;
|
||||||
return true;
|
return true;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -80,16 +88,16 @@ void CsvWriter::WriteToCsv(const T &val, bool end_line) {
|
||||||
|
|
||||||
CsvWriter::~CsvWriter() { CloseFile(); }
|
CsvWriter::~CsvWriter() { CloseFile(); }
|
||||||
|
|
||||||
TensorStatDump::TensorStatDump(const std::string &original_kernel_name, const std::string &op_type,
|
TensorStatDump::TensorStatDump(const std::string &op_type, const std::string &op_name, uint32_t task_id,
|
||||||
const std::string &op_name, uint32_t task_id, uint32_t stream_id, uint64_t timestamp,
|
uint32_t stream_id, uint64_t timestamp, bool input, size_t slot,
|
||||||
bool input, size_t slot)
|
size_t tensor_loader_slot)
|
||||||
: original_kernel_name_{original_kernel_name},
|
: op_type_{op_type},
|
||||||
op_type_{op_type},
|
|
||||||
op_name_{op_name},
|
op_name_{op_name},
|
||||||
task_id_{task_id},
|
task_id_{task_id},
|
||||||
stream_id_{stream_id},
|
stream_id_{stream_id},
|
||||||
timestamp_{timestamp},
|
timestamp_{timestamp},
|
||||||
slot_{slot} {
|
slot_{slot},
|
||||||
|
tensor_loader_slot_{tensor_loader_slot} {
|
||||||
if (input) {
|
if (input) {
|
||||||
io_ = kInput;
|
io_ = kInput;
|
||||||
} else {
|
} else {
|
||||||
|
@ -97,35 +105,37 @@ TensorStatDump::TensorStatDump(const std::string &original_kernel_name, const st
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, const Debugger *debugger) {
|
bool TensorStatDump::OpenStatisticsFile(const std::string &dump_path) {
|
||||||
std::string filename = dump_path + "/" + kCsvFileName;
|
std::string filename = dump_path + "/" + kCsvFileName;
|
||||||
auto file_path = Common::CreatePrefixPath(filename);
|
|
||||||
if (!file_path.has_value()) {
|
|
||||||
MS_LOG(WARNING) << "CreatePrefixPath failed.";
|
|
||||||
return;
|
|
||||||
}
|
|
||||||
// try to open file
|
// try to open file
|
||||||
CsvWriter &csv = CsvWriter::GetInstance();
|
CsvWriter &csv = CsvWriter::GetInstance();
|
||||||
std::string file_path_value = file_path.value();
|
|
||||||
int retry = 2;
|
int retry = 2;
|
||||||
while (retry > 0) {
|
while (retry > 0) {
|
||||||
if (csv.OpenFile(file_path_value, kCsvHeader)) {
|
if (csv.OpenFile(filename, kCsvHeader)) {
|
||||||
break;
|
break;
|
||||||
}
|
}
|
||||||
retry--;
|
retry--;
|
||||||
}
|
}
|
||||||
if (!retry) {
|
if (!retry) {
|
||||||
MS_LOG(WARNING) << "Open statistic dump file failed, skipping current statistics";
|
MS_LOG(WARNING) << "Open statistic dump file failed, skipping current statistics";
|
||||||
return;
|
return false;
|
||||||
|
}
|
||||||
|
return true;
|
||||||
|
}
|
||||||
|
|
||||||
|
bool TensorStatDump::DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
|
||||||
|
const Debugger *debugger) {
|
||||||
|
if (!OpenStatisticsFile(dump_path)) {
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
// get tensor statistics using debugger
|
// get tensor statistics using debugger
|
||||||
std::string tensor_loader_name = original_kernel_name_ + ":" + std::to_string(slot_);
|
std::string tensor_loader_name = original_kernel_name + ":" + std::to_string(tensor_loader_slot_);
|
||||||
std::shared_ptr<TensorData> data = debugger->GetTensor(tensor_loader_name);
|
std::shared_ptr<TensorData> data = debugger->GetTensor(tensor_loader_name);
|
||||||
if (data == nullptr) {
|
if (data == nullptr) {
|
||||||
MS_LOG(WARNING) << "Failed to find tensor in tensor loader, skipping current statistics";
|
MS_LOG(WARNING) << "Failed to find " << tensor_loader_name << " in tensor loader, skipping current statistics";
|
||||||
return;
|
return false;
|
||||||
}
|
}
|
||||||
const DebugServices::TensorStat &stat = debugger->GetTensorStatistics(data);
|
const DebugServices::TensorStat &stat = DebugServices::GetTensorStatistics(data);
|
||||||
// write tensor statistics to csv file
|
// write tensor statistics to csv file
|
||||||
std::ostringstream shape;
|
std::ostringstream shape;
|
||||||
shape << "\"(";
|
shape << "\"(";
|
||||||
|
@ -133,6 +143,7 @@ void TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, const D
|
||||||
shape << (i ? "," : "") << stat.shape[i];
|
shape << (i ? "," : "") << stat.shape[i];
|
||||||
}
|
}
|
||||||
shape << ")\"";
|
shape << ")\"";
|
||||||
|
CsvWriter &csv = CsvWriter::GetInstance();
|
||||||
csv.WriteToCsv(op_type_);
|
csv.WriteToCsv(op_type_);
|
||||||
csv.WriteToCsv(op_name_);
|
csv.WriteToCsv(op_name_);
|
||||||
csv.WriteToCsv(task_id_);
|
csv.WriteToCsv(task_id_);
|
||||||
|
@ -153,5 +164,6 @@ void TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, const D
|
||||||
csv.WriteToCsv(stat.neg_inf_count);
|
csv.WriteToCsv(stat.neg_inf_count);
|
||||||
csv.WriteToCsv(stat.pos_inf_count);
|
csv.WriteToCsv(stat.pos_inf_count);
|
||||||
csv.WriteToCsv(stat.zero_count, true);
|
csv.WriteToCsv(stat.zero_count, true);
|
||||||
|
return true;
|
||||||
}
|
}
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
|
|
|
@ -51,7 +51,6 @@ class TensorStatDump {
|
||||||
static const char CSV_HEADER[];
|
static const char CSV_HEADER[];
|
||||||
static const char CSV_FILE_NAME[];
|
static const char CSV_FILE_NAME[];
|
||||||
|
|
||||||
const std::string &original_kernel_name_;
|
|
||||||
const std::string &op_type_;
|
const std::string &op_type_;
|
||||||
const std::string &op_name_;
|
const std::string &op_name_;
|
||||||
uint32_t task_id_;
|
uint32_t task_id_;
|
||||||
|
@ -59,11 +58,15 @@ class TensorStatDump {
|
||||||
uint64_t timestamp_;
|
uint64_t timestamp_;
|
||||||
std::string io_;
|
std::string io_;
|
||||||
size_t slot_;
|
size_t slot_;
|
||||||
|
size_t tensor_loader_slot_;
|
||||||
|
|
||||||
public:
|
public:
|
||||||
TensorStatDump(const std::string &original_kernel_name, const std::string &op_type, const std::string &op_name,
|
static bool OpenStatisticsFile(const std::string &dump_path);
|
||||||
uint32_t task_id, uint32_t stream_id, uint64_t timestamp, bool input, size_t slot);
|
|
||||||
void DumpTensorStatsToFile(const std::string &dump_path, const Debugger *debugger);
|
TensorStatDump(const std::string &op_type, const std::string &op_name, uint32_t task_id, uint32_t stream_id,
|
||||||
|
uint64_t timestamp, bool input, size_t slot, size_t tensor_loader_slot_);
|
||||||
|
bool DumpTensorStatsToFile(const std::string &original_kernel_name, const std::string &dump_path,
|
||||||
|
const Debugger *debugger);
|
||||||
};
|
};
|
||||||
} // namespace mindspore
|
} // namespace mindspore
|
||||||
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
|
#endif // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
|
||||||
|
|
|
@ -1140,7 +1140,7 @@ std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
// tensor was found creating tensor summary object.
|
// tensor was found creating tensor summary object.
|
||||||
DebugServices::TensorStat tensor_stat = debug_services_->GetTensorStatistics(tensor);
|
DebugServices::TensorStat tensor_stat = DebugServices::GetTensorStatistics(tensor);
|
||||||
AddTensorStatInfo(tensor_stat, &tensor_summary_list);
|
AddTensorStatInfo(tensor_stat, &tensor_summary_list);
|
||||||
}
|
}
|
||||||
return tensor_summary_list;
|
return tensor_summary_list;
|
||||||
|
@ -1150,10 +1150,6 @@ std::shared_ptr<TensorData> Debugger::GetTensor(const std::string &tensor_name)
|
||||||
return debug_services_->GetTensor(tensor_name);
|
return debug_services_->GetTensor(tensor_name);
|
||||||
}
|
}
|
||||||
|
|
||||||
DebugServices::TensorStat Debugger::GetTensorStatistics(std::shared_ptr<TensorData> tensor_data) const {
|
|
||||||
return DebugServices::GetTensorStatistics(tensor_data);
|
|
||||||
}
|
|
||||||
|
|
||||||
void Debugger::Exit(bool exit_success) {
|
void Debugger::Exit(bool exit_success) {
|
||||||
// debugger will notify main thread to exit because main thread can only exit at step boundary.
|
// debugger will notify main thread to exit because main thread can only exit at step boundary.
|
||||||
MS_LOG(INFO) << "Exit Debugger";
|
MS_LOG(INFO) << "Exit Debugger";
|
||||||
|
|
|
@ -117,8 +117,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
|
|
||||||
std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const;
|
std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const;
|
||||||
|
|
||||||
DebugServices::TensorStat GetTensorStatistics(std::shared_ptr<TensorData> tensor_data) const;
|
|
||||||
|
|
||||||
bool debugger_enabled() const;
|
bool debugger_enabled() const;
|
||||||
|
|
||||||
bool partial_memory() const;
|
bool partial_memory() const;
|
||||||
|
|
|
@ -343,7 +343,7 @@ std::vector<TensorStatData> DbgServices::ReadTensorsStat(const std::vector<tenso
|
||||||
AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
|
AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
|
||||||
continue;
|
continue;
|
||||||
}
|
}
|
||||||
DebugServices::TensorStat tensor_statistics = debug_services_->GetTensorStatistics(result);
|
DebugServices::TensorStat tensor_statistics = DebugServices::GetTensorStatistics(result);
|
||||||
AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
|
AddTensorStatInfo(tensor_statistics, &tensors_read_stat);
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue