Add option to dump tensor statistics in csv format

commit b21c099767, parent f1f7731fb2 (forked from mindspore-Ecosystem/mindspore)
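This commit adds a "saved_data" option to the dump JSON configuration: "tensor" (default, binary .npy dumps only), "statistic" (statistic.csv only, GPU only), or "full" (both). A minimal config sketch in Python; the field names follow the constants parsed in dump_json_parser.cc below, while the concrete values (support_device list, iteration string) are illustrative assumptions:

# Hypothetical minimal dump config; field names per dump_json_parser.cc in this diff.
import json

dump_config = {
    "common_dump_settings": {
        "dump_mode": 0,
        "path": "/tmp/gpu_e2e_dump",   # absolute output path
        "net_name": "Net",
        "saved_data": "statistic",     # "statistic", "tensor", or "full"
        "iteration": "all",
        "input_output": 0,
        "kernels": [],
        "support_device": [0, 1, 2, 3, 4, 5, 6, 7]
    },
    "e2e_dump_settings": {"enable": True, "trans_flag": False}
}
with open("dump_config.json", "w") as f:
    json.dump(dump_config, f)
# Point MindSpore at it before constructing the network, e.g.:
# os.environ['MINDSPORE_DUMP_CONFIG'] = 'dump_config.json'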
debug/CMakeLists.txt
@@ -41,6 +41,7 @@ if(ENABLE_DEBUGGER)
         "${CMAKE_CURRENT_SOURCE_DIR}/debugger/tensor_summary.cc"
         "${CMAKE_CURRENT_SOURCE_DIR}/debug_services.cc"
         "${CMAKE_CURRENT_SOURCE_DIR}/debugger/debugger_utils.cc"
+        "${CMAKE_CURRENT_SOURCE_DIR}/data_dump/tensor_stat_dump.cc"
     )
 endif()
debug/data_dump/dump_json_parser.cc
@@ -31,6 +31,7 @@ constexpr auto kE2eDumpSettings = "e2e_dump_settings";
 constexpr auto kDumpMode = "dump_mode";
 constexpr auto kPath = "path";
 constexpr auto kNetName = "net_name";
+constexpr auto kSavedData = "saved_data";
 constexpr auto kIteration = "iteration";
 constexpr auto kInputOutput = "input_output";
 constexpr auto kKernels = "kernels";
@@ -38,6 +39,9 @@ constexpr auto kSupportDevice = "support_device";
 constexpr auto kEnable = "enable";
 constexpr auto kOpDebugMode = "op_debug_mode";
 constexpr auto kTransFlag = "trans_flag";
+constexpr auto kStatisticDump = "statistic";
+constexpr auto kTensorDump = "tensor";
+constexpr auto kFullDump = "full";
 constexpr auto kDumpInputAndOutput = 0;
 constexpr auto kDumpInputOnly = 1;
 constexpr auto kDumpOutputOnly = 2;
@@ -263,6 +267,7 @@ void DumpJsonParser::ParseCommonDumpSetting(const nlohmann::json &content) {
   ParseDumpMode(*dump_mode);
   ParseDumpPath(*common_dump_settings);  // Pass in the whole json string to parse because the path field is optional.
   ParseNetName(*net_name);
+  ParseSavedData(*common_dump_settings);  // saved data optional
   ParseIteration(*iteration);
   ParseInputOutput(*input_output);
   ParseKernels(*kernels);
@@ -355,6 +360,24 @@ void DumpJsonParser::ParseNetName(const nlohmann::json &content) {
   }
 }
 
+void DumpJsonParser::ParseSavedData(const nlohmann::json &content) {
+  saved_data_ = kTensorDump;  // default to tensor data dump
+  auto json_iter = content.find(kSavedData);
+  if (json_iter != content.end()) {
+    CheckJsonStringType(*json_iter, kSavedData);
+    saved_data_ = *json_iter;
+  }
+  if (saved_data_ != kStatisticDump && saved_data_ != kTensorDump && saved_data_ != kFullDump) {
+    MS_LOG(EXCEPTION) << "Dump Json parse failed, saved_data only supports statistic, tensor, or full, but got: "
+                      << saved_data_ << ". Please set saved_data to either statistic, tensor, or full";
+  }
+  auto context = MsContext::GetInstance();
+  if (IsStatisticDump() && context->get_param<std::string>(MS_CTX_DEVICE_TARGET) != kGPUDevice) {
+    MS_LOG(EXCEPTION) << "Dump Json parse failed, storing statistic dump is only supported on GPU device, please set "
+                         "saved_data to tensor or use a GPU device";
+  }
+}
+
 void DumpJsonParser::ParseIteration(const nlohmann::json &content) {
   CheckJsonStringType(content, kIteration);
   auto context = MsContext::GetInstance();
@@ -397,6 +420,12 @@ bool IsIterInRange(uint32_t iteration, const std::string &range) {
   return (low_range <= iteration) && (iteration <= high_range);
 }
 
+bool DumpJsonParser::IsStatisticDump() const { return saved_data_ == kStatisticDump || IsFullDump(); }
+
+bool DumpJsonParser::IsTensorDump() const { return saved_data_ == kTensorDump || IsFullDump(); }
+
+bool DumpJsonParser::IsFullDump() const { return saved_data_ == kFullDump; }
+
 bool DumpJsonParser::IsDumpIter(uint32_t iteration) const {
   // bool DumpJsonParser::IsDumpIter(uint32_t iteration) --> checks if iteration should be dumped or not.
   if (iteration_ == "all") {
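Note the predicate semantics introduced above: "full" implies both of the other two, so call sites only ever test IsStatisticDump() and IsTensorDump():

saved_data | IsStatisticDump | IsTensorDump | IsFullDump
statistic  | true            | false        | false
tensor     | false           | true         | false
full       | true            | true         | true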
debug/data_dump/dump_json_parser.h
@@ -42,6 +42,9 @@ class DumpJsonParser {
   bool NeedDump(const std::string &op_full_name) const;
   void MatchKernel(const std::string &kernel_name);
   void PrintUnusedKernel();
+  bool IsStatisticDump() const;
+  bool IsTensorDump() const;
+  bool IsFullDump() const;
   bool IsDumpIter(uint32_t iteration) const;
   bool DumpAllIter();
 
@@ -49,6 +52,7 @@ class DumpJsonParser {
   bool e2e_dump_enabled() const { return e2e_dump_enabled_; }
   uint32_t dump_mode() const { return dump_mode_; }
   std::string path() const { return path_; }
+  std::string saved_data() const { return saved_data_; }
   std::string iteration_string() const { return iteration_; }
   std::string net_name() const { return net_name_; }
   uint32_t op_debug_mode() const { return op_debug_mode_; }
@@ -76,6 +80,7 @@ class DumpJsonParser {
   uint32_t dump_mode_{0};
   std::string path_;
   std::string net_name_;
+  std::string saved_data_;
   std::string iteration_;
   uint32_t input_output_{0};
   std::map<std::string, uint32_t> kernels_;
@@ -97,6 +102,7 @@ class DumpJsonParser {
   void ParseDumpMode(const nlohmann::json &content);
   void ParseDumpPath(const nlohmann::json &content);
   void ParseNetName(const nlohmann::json &content);
+  void ParseSavedData(const nlohmann::json &content);
   void ParseIteration(const nlohmann::json &content);
   void ParseInputOutput(const nlohmann::json &content);
   void ParseKernels(const nlohmann::json &content);
debug/data_dump/e2e_dump.cc
@@ -29,6 +29,7 @@
 #include "runtime/device/kernel_runtime_manager.h"
 #include "utils/config_manager.h"
 #include "utils/file_utils.h"
+#include "debug/data_dump/tensor_stat_dump.h"
 #ifdef ENABLE_DEBUGGER
 #include "debug/debug_services.h"
 #include "debug/tensor_load.h"
@@ -117,8 +118,14 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
                             std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." +
                             std::to_string(j);
     if (IsDeviceTargetGPU()) {
-      DumpGPUMemToFile(file_path, GetKernelNodeName(node), *addr, int_shapes, type, device_type, trans_flag, j,
-                       debugger);
+      if (DumpJsonParser::GetInstance().IsStatisticDump()) {
+        TensorStatDump stat_dump(GetKernelNodeName(node), op_type, op_name, task_id, stream_id, timestamp, false, j);
+        stat_dump.DumpTensorStatsToFile(dump_path, debugger);
+      }
+      if (DumpJsonParser::GetInstance().IsTensorDump()) {
+        DumpGPUMemToFile(file_path, GetKernelNodeName(node), *addr, int_shapes, type, device_type, trans_flag, j,
+                         debugger);
+      }
     } else {
       DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
     }
@@ -196,7 +203,13 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
                             std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
     MS_EXCEPTION_IF_NULL(addr);
     if (IsDeviceTargetGPU()) {
-      DumpGPUMemToFile(file_path, tensor_name, *addr, int_shapes, type, device_type, trans_flag, slot, debugger);
+      if (DumpJsonParser::GetInstance().IsStatisticDump()) {
+        TensorStatDump stat_dump(tensor_name, op_type, op_name, task_id, stream_id, timestamp, true, slot);
+        stat_dump.DumpTensorStatsToFile(dump_path, debugger);
+      }
+      if (DumpJsonParser::GetInstance().IsTensorDump()) {
+        DumpGPUMemToFile(file_path, tensor_name, *addr, int_shapes, type, device_type, trans_flag, slot, debugger);
+      }
     } else {
       DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
     }
@@ -242,7 +255,13 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
   std::string file_path = dump_path + "/Parameter." + dump_name + '.' + std::to_string(task_id) + '.' +
                           std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output.0";
   if (IsDeviceTargetGPU()) {
-    DumpGPUMemToFile(file_path, node_name, *addr, int_shapes, type, device_type, trans_flag, 0, debugger);
+    if (dump_json_parser.IsStatisticDump()) {
+      TensorStatDump stat_dump(node_name, "Parameter", dump_name, task_id, stream_id, timestamp, false, 0);
+      stat_dump.DumpTensorStatsToFile(dump_path, debugger);
+    }
+    if (dump_json_parser.IsTensorDump()) {
+      DumpGPUMemToFile(file_path, node_name, *addr, int_shapes, type, device_type, trans_flag, 0, debugger);
+    }
   } else {
     DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
   }
@@ -356,9 +375,15 @@ void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
     MS_LOG(INFO) << "Current graph id is " << graph_id;
     std::string dump_path = GenerateDumpPath(graph_id, rank_id);
 
+    if (dump_json_parser.IsStatisticDump()) {
+      CsvWriter::GetInstance().OpenFile(dump_path);
+    }
     DumpInput(graph, dump_path, debugger);
     DumpOutput(graph, dump_path, debugger);
     DumpParametersAndConst(graph, dump_path, debugger);
+    if (dump_json_parser.IsStatisticDump()) {
+      CsvWriter::GetInstance().CloseFile();
+    }
     success = true;
   }
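With this wiring, E2eDump::DumpData brackets the input/output/parameter passes with one statistic.csv per dumped graph, so every tensor of an iteration appends a row to the same file via TensorStatDump below. A sketch of the resulting on-disk layout, assuming the rank/net/graph/iteration path and npy naming exercised by the tests at the end of this diff:

<path>/rank_0/Net/<graph_id>/<iteration>/statistic.csv                                     saved_data "statistic" or "full"
<path>/rank_0/Net/<graph_id>/<iteration>/<OpType>.<OpName>.<task>.<stream>.<timestamp>.output.<slot>.<Format>.npy   saved_data "tensor" or "full"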
debug/data_dump/tensor_stat_dump.cc (new file)
@@ -0,0 +1,157 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#include "debug/data_dump/tensor_stat_dump.h"
+
+#include <memory>
+#include "utils/file_utils.h"
+#include "debug/common.h"
+#include "debug/debug_services.h"
+#include "debug/debugger/debugger.h"
+
+namespace {
+constexpr auto kInput = "input";
+constexpr auto kOutput = "output";
+constexpr auto kCsvHeader =
+  "Op Type,Op Name,Task ID,Stream ID,Timestamp,IO,Slot,Data Size,Data Type,Shape,Max Value,Min Value,Avg Value,"
+  "Count,Negative Zero Count,Positive Zero Count,NaN Count,Negative Inf Count,Positive Inf Count,Zero Count\n";
+constexpr auto kCsvFileName = "statistic.csv";
+}  // namespace
+namespace mindspore {
+bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
+  if (file_.is_open() && path == file_path_str_) {
+    return true;
+  }
+  if (file_.is_open()) {
+    CloseFile();
+  }
+  bool first_time_opening = file_path_str_ != path;
+  ChangeFileMode(path, S_IWUSR);
+  if (first_time_opening) {
+    // remove any possible output from previous runs
+    file_.open(path, std::ios::out | std::ios::trunc | std::ios::binary);
+  } else {
+    file_.open(path, std::ios::out | std::ios::app | std::ios::binary);
+  }
+  if (!file_.is_open()) {
+    MS_LOG(WARNING) << "Open file " << path << " failed." << ErrnoToString(errno);
+    return false;
+  }
+  if (first_time_opening) {
+    file_ << header;
+    file_.flush();
+    file_path_str_ = path;
+  }
+  MS_LOG(INFO) << "Opened statistics file: " << path;
+  return true;
+}
+
+void CsvWriter::CloseFile() {
+  if (file_.is_open()) {
+    file_.close();
+    ChangeFileMode(file_path_str_, S_IRUSR);
+    MS_LOG(INFO) << "Closed statistics dump file: " << file_path_str_;
+  }
+}
+
+template <typename T>
+void CsvWriter::WriteToCsv(const T &val, bool end_line) {
+  file_ << val;
+  if (end_line) {
+    file_ << kEndLine;
+    file_.flush();
+  } else {
+    file_ << kSeparator;
+  }
+}
+
+CsvWriter::~CsvWriter() { CloseFile(); }
+
+TensorStatDump::TensorStatDump(const std::string &original_kernel_name, const std::string &op_type,
+                               const std::string &op_name, uint32_t task_id, uint32_t stream_id, uint64_t timestamp,
+                               bool input, size_t slot)
+    : original_kernel_name_{original_kernel_name},
+      op_type_{op_type},
+      op_name_{op_name},
+      task_id_{task_id},
+      stream_id_{stream_id},
+      timestamp_{timestamp},
+      slot_{slot} {
+  if (input) {
+    io_ = kInput;
+  } else {
+    io_ = kOutput;
+  }
+}
+
+void TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, const Debugger *debugger) {
+  std::string filename = dump_path + "/" + kCsvFileName;
+  auto file_path = Common::CreatePrefixPath(filename);
+  if (!file_path.has_value()) {
+    MS_LOG(WARNING) << "CreatePrefixPath failed.";
+    return;
+  }
+  // try to open file
+  CsvWriter &csv = CsvWriter::GetInstance();
+  std::string file_path_value = file_path.value();
+  int retry = 2;
+  while (retry > 0) {
+    if (csv.OpenFile(file_path_value, kCsvHeader)) {
+      break;
+    }
+    retry--;
+  }
+  if (!retry) {
+    MS_LOG(WARNING) << "Open statistic dump file failed, skipping current statistics";
+    return;
+  }
+  // get tensor statistics using debugger
+  std::string tensor_loader_name = original_kernel_name_ + ":" + std::to_string(slot_);
+  std::shared_ptr<TensorData> data = debugger->GetTensor(tensor_loader_name);
+  if (data == nullptr) {
+    MS_LOG(WARNING) << "Failed to find tensor in tensor loader, skipping current statistics";
+    return;
+  }
+  const DebugServices::TensorStat &stat = debugger->GetTensorStatistics(data);
+  // write tensor statistics to csv file
+  std::ostringstream shape;
+  shape << "\"(";
+  for (size_t i = 0; i < stat.shape.size(); i++) {
+    shape << (i ? "," : "") << stat.shape[i];
+  }
+  shape << ")\"";
+  csv.WriteToCsv(op_type_);
+  csv.WriteToCsv(op_name_);
+  csv.WriteToCsv(task_id_);
+  csv.WriteToCsv(stream_id_);
+  csv.WriteToCsv(timestamp_);
+  csv.WriteToCsv(io_);
+  csv.WriteToCsv(slot_);
+  csv.WriteToCsv(stat.data_size);
+  csv.WriteToCsv(stat.dtype);
+  csv.WriteToCsv(shape.str());
+  csv.WriteToCsv(stat.max_value);
+  csv.WriteToCsv(stat.min_value);
+  csv.WriteToCsv(stat.avg_value);
+  csv.WriteToCsv(stat.count);
+  csv.WriteToCsv(stat.neg_zero_count);
+  csv.WriteToCsv(stat.pos_zero_count);
+  csv.WriteToCsv(stat.nan_count);
+  csv.WriteToCsv(stat.neg_inf_count);
+  csv.WriteToCsv(stat.pos_inf_count);
+  csv.WriteToCsv(stat.zero_count, true);
+}
+}  // namespace mindspore
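The Shape column is written wrapped in double quotes (e.g. "(2,3)") because it contains commas, so standard CSV readers keep it as a single field. A minimal reader sketch in Python, assuming a statistic.csv produced by this dump sits in the current directory; column names come from kCsvHeader above:

# Sketch: read the emitted statistic.csv with the standard csv module.
import csv

with open('statistic.csv') as f:
    for row in csv.DictReader(f):
        # The quoted Shape field parses back as one column, e.g. '(2,3)'.
        print(row['Op Name'], row['IO'], row['Slot'], row['Shape'], row['Max Value'])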
debug/data_dump/tensor_stat_dump.h (new file)
@@ -0,0 +1,69 @@
+/**
+ * Copyright 2021 Huawei Technologies Co., Ltd
+ *
+ * Licensed under the Apache License, Version 2.0 (the "License");
+ * you may not use this file except in compliance with the License.
+ * You may obtain a copy of the License at
+ *
+ * http://www.apache.org/licenses/LICENSE-2.0
+ *
+ * Unless required by applicable law or agreed to in writing, software
+ * distributed under the License is distributed on an "AS IS" BASIS,
+ * WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+ * See the License for the specific language governing permissions and
+ * limitations under the License.
+ */
+
+#ifndef MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
+#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
+
+#include <string>
+#include <fstream>
+
+#include "utils/ms_utils.h"
+
+namespace mindspore {
+class Debugger;
+class CsvWriter {
+ public:
+  static CsvWriter &GetInstance() {
+    static CsvWriter instance;
+    return instance;
+  }
+
+ private:
+  const std::string kSeparator = ",";
+  const std::string kEndLine = "\n";
+  std::ofstream file_;
+  std::string file_path_str_ = "";
+
+ public:
+  CsvWriter() = default;
+  ~CsvWriter();
+  DISABLE_COPY_AND_ASSIGN(CsvWriter)
+  bool OpenFile(const std::string &path, const std::string &header = "");
+  void CloseFile();
+  template <typename T>
+  void WriteToCsv(const T &val, bool end_line = false);
+};
+
+class TensorStatDump {
+  static const char CSV_HEADER[];
+  static const char CSV_FILE_NAME[];
+
+  const std::string &original_kernel_name_;
+  const std::string &op_type_;
+  const std::string &op_name_;
+  uint32_t task_id_;
+  uint32_t stream_id_;
+  uint64_t timestamp_;
+  std::string io_;
+  size_t slot_;
+
+ public:
+  TensorStatDump(const std::string &original_kernel_name, const std::string &op_type, const std::string &op_name,
+                 uint32_t task_id, uint32_t stream_id, uint64_t timestamp, bool input, size_t slot);
+  void DumpTensorStatsToFile(const std::string &dump_path, const Debugger *debugger);
+};
+}  // namespace mindspore
+#endif  // MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_TENSOR_STAT_DUMP_H_
debug/debug_services.cc
@@ -335,6 +335,11 @@ void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tenso
 }
 #endif
 
+void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_found) {
+  if (history_not_found) {
+    *error_code = ITensorSummary::HISTORY_NOT_FOUND;  // error code for history not found
+  }
+}
 void DebugServices::CheckWatchpointsForTensor(
   partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
   partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
@@ -422,9 +427,7 @@ void DebugServices::CheckWatchpointsForTensor(
       is_hit = std::get<ITensorSummary::eHitPos>(item);
       error_code = std::get<ITensorSummary::eErrorCodePos>(item);
 #ifdef OFFLINE_DBG_MODE
-      if (history_not_found) {
-        error_code = ITensorSummary::HISTORY_NOT_FOUND;  // error code for history not found
-      }
+      CheckHistoryErrorCode(&error_code, history_not_found);
 #endif
       parameter_list = std::get<ITensorSummary::eParamListPos>(item);
     }
@@ -1414,6 +1417,10 @@ bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNode
 
 std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
 
+std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_name) const {
+  return tensor_loader_->GetTensor(tensor_name);
+}
+
 void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
 
 #ifdef ONLINE_DBG_MODE
debug/debug_services.h
@@ -236,7 +236,7 @@ class DebugServices {
     int zero_count = 0;
   };
 
-  TensorStat GetTensorStatistics(const std::shared_ptr<TensorData> &tensor);
+  static TensorStat GetTensorStatistics(const std::shared_ptr<TensorData> &tensor);
 
   void AddWatchpoint(
     unsigned int id, unsigned int watch_condition, float parameter,
@@ -260,6 +260,8 @@ class DebugServices {
                      const std::vector<parameter_t> &parameter_list);
 #endif
 
+  void CheckHistoryErrorCode(int *error_code, bool history_not_found);
+
   void CheckWatchpointsForTensor(partitioned_names *chunk_names, partitioned_names *chunk_slots,
                                  partitioned_numbers *chunk_conditions, partitioned_id *const chunk_watchpoint_id,
                                  partitioned_parameters *chunk_parameters, partitioned_error_code *chunk_error_codes,
@@ -413,6 +415,8 @@ class DebugServices {
 
   std::vector<std::shared_ptr<TensorData>> GetTensor() const;
 
+  std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const;
+
   void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name);
 
   void EmptyCurrentTensor();
debug/debugger/debugger.cc
@@ -1093,6 +1093,14 @@ std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto
   return tensor_summary_list;
 }
 
+std::shared_ptr<TensorData> Debugger::GetTensor(const std::string &tensor_name) const {
+  return debug_services_->GetTensor(tensor_name);
+}
+
+DebugServices::TensorStat Debugger::GetTensorStatistics(std::shared_ptr<TensorData> tensor_data) const {
+  return DebugServices::GetTensorStatistics(tensor_data);
+}
+
 void Debugger::Exit(bool exit_success) {
   // debugger will notify main thread to exit because main thread can only exit at step boundary.
   MS_LOG(INFO) << "Exit Debugger";
debug/debugger/debugger.h
@@ -108,6 +108,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
 
   bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
 
+  std::shared_ptr<TensorData> GetTensor(const std::string &tensor_name) const;
+
+  DebugServices::TensorStat GetTensorStatistics(std::shared_ptr<TensorData> tensor_data) const;
+
   bool debugger_enabled() const;
 
   bool partial_memory() const;
dump_test_utils.py
@@ -140,6 +140,19 @@ def generate_dump_json_with_overflow(dump_path, json_file_name, test_key, op):
     with open(json_file_name, 'w') as f:
         json.dump(data, f)
 
+def generate_statistic_dump_json(dump_path, json_file_name, test_key, saved_data):
+    """
+    Util function to generate dump configuration json file for statistic dump.
+    """
+    if test_key == "test_gpu_e2e_dump":
+        data = e2e_dump_dict
+        data["common_dump_settings"]["path"] = dump_path
+        data["common_dump_settings"]["saved_data"] = saved_data
+    else:
+        raise ValueError(
+            "Failed to generate statistic dump json file. The test name value " + test_key + " is invalid.")
+    with open(json_file_name, 'w') as f:
+        json.dump(data, f)
+
 def check_dump_structure(dump_path, json_file_path, num_card, num_graph, num_iteration):
     """
test_data_dump.py
@@ -18,6 +18,7 @@ import tempfile
 import time
 import shutil
 import glob
+import csv
 from importlib import import_module
 from pathlib import Path
 import numpy as np
@@ -33,7 +34,7 @@ from mindspore.nn import Momentum
 from mindspore.nn import TrainOneStepCell
 from mindspore.nn import WithLossCell
 from dump_test_utils import generate_dump_json, generate_dump_json_with_overflow, \
-    check_dump_structure, find_nth_pos
+    generate_statistic_dump_json, check_dump_structure, find_nth_pos
 from tests.security_utils import security_off_wrap
 
 
@@ -392,3 +393,101 @@ def test_ascend_not_overflow_dump():
     """
     context.set_context(mode=context.GRAPH_MODE, device_target='Ascend')
     run_not_overflow_dump()
+
+def check_statistic_dump(dump_file_path):
+    output_name = "statistic.csv"
+    output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
+    real_path = os.path.realpath(output_path)
+    with open(real_path) as f:
+        reader = csv.DictReader(f)
+        input1 = next(reader)
+        assert input1['IO'] == 'input'
+        assert input1['Min Value'] == '1'
+        assert input1['Max Value'] == '6'
+        input2 = next(reader)
+        assert input2['IO'] == 'input'
+        assert input2['Min Value'] == '7'
+        assert input2['Max Value'] == '12'
+        output = next(reader)
+        assert output['IO'] == 'output'
+        assert output['Min Value'] == '8'
+        assert output['Max Value'] == '18'
+
+def check_data_dump(dump_file_path):
+    output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
+    output_path = glob.glob(os.path.join(dump_file_path, output_name))[0]
+    real_path = os.path.realpath(output_path)
+    output = np.load(real_path)
+    expect = np.array([[8, 10, 12], [14, 16, 18]], np.float32)
+    assert np.array_equal(output, expect)
+
+def run_gpu_e2e_dump(saved_data):
+    """Run gpu e2e dump"""
+    if sys.platform != 'linux':
+        return
+    pwd = os.getcwd()
+    with tempfile.TemporaryDirectory(dir=pwd) as tmp_dir:
+        dump_path = os.path.join(tmp_dir, 'gpu_e2e_dump')
+        dump_config_path = os.path.join(tmp_dir, 'gpu_e2e_dump.json')
+        generate_statistic_dump_json(dump_path, dump_config_path, 'test_gpu_e2e_dump', saved_data)
+        os.environ['MINDSPORE_DUMP_CONFIG'] = dump_config_path
+        dump_file_path = os.path.join(dump_path, 'rank_0', 'Net', '0', '0')
+        if os.path.isdir(dump_path):
+            shutil.rmtree(dump_path)
+        add = Net()
+        add(Tensor(x), Tensor(y))
+        for _ in range(3):
+            if not os.path.exists(dump_file_path):
+                time.sleep(2)
+        check_dump_structure(dump_path, dump_config_path, 1, 1, 1)
+        if saved_data in ('statistic', 'full'):
+            check_statistic_dump(dump_file_path)
+        if saved_data in ('tensor', 'full'):
+            check_data_dump(dump_file_path)
+        if saved_data == 'statistic':
+            # assert only file is statistic.csv, tensor data is not saved
+            assert len(os.listdir(dump_file_path)) == 1
+        elif saved_data == 'tensor':
+            # assert only tensor data is saved, not statistics
+            stat_path = os.path.join(dump_file_path, 'statistic.csv')
+            assert not os.path.isfile(stat_path)
+        del os.environ['MINDSPORE_DUMP_CONFIG']
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_gpu_e2e_statistic_dump():
+    """
+    Feature: GPU Statistics Dump
+    Description: Test GPU statistics dump
+    Expectation: Statistics are stored in statistic.csv files
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    run_gpu_e2e_dump('statistic')
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_gpu_e2e_tensor_dump():
+    """
+    Feature: GPU Tensor Dump
+    Description: Test GPU tensor dump
+    Expectation: Tensor data are stored in npy files
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    run_gpu_e2e_dump('tensor')
+
+@pytest.mark.level0
+@pytest.mark.platform_x86_gpu_training
+@pytest.mark.env_onecard
+@security_off_wrap
+def test_gpu_e2e_full_dump():
+    """
+    Feature: GPU Full Dump
+    Description: Test GPU full dump
+    Expectation: Tensor are stored in npy files and their statistics stored in statistic.csv
+    """
+    context.set_context(mode=context.GRAPH_MODE, device_target="GPU")
+    run_gpu_e2e_dump('full')
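On a GPU machine the three new tests can be run individually with pytest, e.g. (the test file location is assumed from the repository's test layout):

pytest tests/st/dump/test_data_dump.py::test_gpu_e2e_statistic_dump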