!29062 Add comments for dump and debugger code, remove unused functions

Merge pull request !29062 from parastooashtari/debugger_marker
This commit is contained in:
i-robot 2022-01-19 16:22:44 +00:00 committed by Gitee
commit 180b101ad5
19 changed files with 791 additions and 19 deletions

View File

@ -608,6 +608,7 @@ void AscendSession::CompileChildGraph(const KernelGraphPtr &child_graph) {
bool AscendSession::IsSupportSummary() { return !device::KernelAdjust::NeedLoopSink(); }
// Ascend old runtime.
void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *const) {
#ifdef ENABLE_DEBUGGER
@ -625,6 +626,7 @@ void AscendSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_g
#endif
}
// Ascend old runtime.
void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &, VectorRef *const) {
// summary
@ -1659,6 +1661,7 @@ void AscendSession::HardwareOptimize(NotNull<KernelGraphPtr> graph,
}
#ifdef ENABLE_DEBUGGER
// Load graphs and their children for Ascend old runtime.
void AscendSession::LoadGraphsToDbg(NotNull<KernelGraphPtr> graph,
NotNull<std::set<KernelGraphPtr> *> const memo) const {
if (memo->find(graph) != memo->end()) {

View File

@ -126,7 +126,7 @@ void GPUSession::Init(uint32_t device_id) {
}
#ifndef ENABLE_SECURITY
auto &json_parser = DumpJsonParser::GetInstance();
// Dump json config file if dump is enabled
// Dump json config file if dump is enabled for GPU old runtime.
json_parser.CopyDumpJsonToDir(rank_id_);
json_parser.CopyMSCfgJsonToDir(rank_id_);
#endif
@ -413,7 +413,7 @@ GraphId GPUSession::CompileGraphImpl(NotNull<FuncGraphPtr> func_graph) {
GraphId GPUSession::CompileGraphImpl(const KernelGraphPtr &graph) {
MS_EXCEPTION_IF_NULL(graph);
// Prepare ms context info for dump .pb graph
// Prepare ms context info for dump .pb graph for GPU old runtime.
auto context_ptr = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context_ptr);
auto runtime_instance = device::KernelRuntimeManager::Instance().GetSingleKernelRuntime(kGPUDevice, device_id_);
@ -471,6 +471,7 @@ GraphId GPUSession::CompileGraphImpl(const KernelGraphPtr &graph) {
}
#endif
#ifndef ENABLE_SECURITY
// GPU old runtime.
if (json_parser.e2e_dump_enabled()) {
graph->set_root_graph_id(graph->graph_id());
std::string final_graph = "trace_code_graph_" + std::to_string(graph->graph_id());
@ -509,6 +510,7 @@ GraphId GPUSession::CompileGraphImpl(const KernelGraphPtr &graph) {
return graph->graph_id();
}
// GPU old runtime.
void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
#ifdef ENABLE_DEBUGGER
@ -525,6 +527,7 @@ void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_grap
#endif
}
// GPU old runtime.
void GPUSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_graph,
const std::vector<tensor::TensorPtr> &inputs, VectorRef *outputs) {
// Summary
@ -730,6 +733,7 @@ void GPUSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) con
}
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
// Dump graph and graph history file if e2e_dump is enabled and update cur_dump_iter for GPU old runtime.
if (debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
E2eDump::DumpRunIter(kernel_graph, rank_id_);

View File

@ -91,6 +91,12 @@ bool DumpJsonParser::IsDumpEnabled() {
return true;
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Parse the configuration options in the dump json file pointed to by the environment variable MINDSPORE_DUMP_CONFIG.
*/
void DumpJsonParser::Parse() {
std::lock_guard<std::mutex> guard(lock_);
if (already_parsed_) {
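A minimal standalone sketch of how the MINDSPORE_DUMP_CONFIG-driven configuration described above can be located and read; the main() wrapper and error handling are illustrative assumptions, not MindSpore code:

#include <cstdlib>
#include <fstream>
#include <iostream>
#include <iterator>
#include <string>

int main() {
  // Dump is driven by a json config file whose path is taken from MINDSPORE_DUMP_CONFIG.
  const char *config_path = std::getenv("MINDSPORE_DUMP_CONFIG");
  if (config_path == nullptr) {
    std::cout << "Dump is not enabled: MINDSPORE_DUMP_CONFIG is not set." << std::endl;
    return 0;
  }
  std::ifstream json_file(config_path);
  if (!json_file.is_open()) {
    std::cerr << "Cannot open dump config file: " << config_path << std::endl;
    return 1;
  }
  // Read the whole file; the real parser extracts the individual dump options from this json content.
  std::string content((std::istreambuf_iterator<char>(json_file)), std::istreambuf_iterator<char>());
  std::cout << "Read " << content.size() << " bytes of dump configuration." << std::endl;
  return 0;
}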
@ -144,6 +150,12 @@ void WriteJsonFile(const std::string &file_path, const std::ifstream &json_file)
ChangeFileMode(file_path, S_IRUSR);
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Copy the dump configuration file to the root directory of the dump path.
*/
void DumpJsonParser::CopyDumpJsonToDir(uint32_t rank_id) {
this->Parse();
if (!IsDumpEnabled()) {
@ -165,6 +177,12 @@ void DumpJsonParser::CopyDumpJsonToDir(uint32_t rank_id) {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: Copy the hccl configuration file to the root directory of the dump path.
*/
void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
if (!IsDumpEnabled()) {
return;
@ -186,6 +204,13 @@ void DumpJsonParser::CopyHcclJsonToDir(uint32_t rank_id) {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Copy the mindspore configuration file to the root directory of the dump path. It provides the device
 * and ms_version information.
*/
void DumpJsonParser::CopyMSCfgJsonToDir(uint32_t rank_id) {
if (!IsDumpEnabled()) {
return;
@ -217,6 +242,12 @@ bool DumpJsonParser::DumpEnabledForIter() const {
return ((e2e_dump_enabled_ || async_dump_enabled_) && IsDumpIter(cur_dump_iter_));
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Dump the data at the given address into an npy file.
*/
bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, size_t len, const ShapeVector &shape,
TypeId type) {
if (filename.empty() || data == nullptr || len == 0) {
@ -595,6 +626,12 @@ void DumpJsonParser::JudgeDumpEnabled() {
JsonConfigToString();
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Check if the given op needs to be dumped based on the configuration options.
*/
bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
bool need_dump = false;
switch (dump_mode_) {
@ -617,6 +654,12 @@ bool DumpJsonParser::NeedDump(const std::string &op_full_name) const {
return need_dump;
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Increment the dump count for the given kernel.
*/
void DumpJsonParser::MatchKernel(const std::string &kernel_name) {
auto iter = kernels_.find(kernel_name);
if (iter == kernels_.end()) {
@ -637,6 +680,12 @@ void DumpJsonParser::PrintUnusedKernel() {
}
}
/*
* Feature group: Online debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: Generate the directory path where the overflow bin file is located.
*/
std::string DumpJsonParser::GetOpOverflowBinPath(uint32_t graph_id) const {
std::string bin_path;
bin_path.append(path_);
@ -674,6 +723,12 @@ bool DumpJsonParser::OutputNeedDump() const {
return input_output_ == kDumpInputAndOutput || input_output_ == kDumpOutputOnly;
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: Obtain the cell dump flag of each operator in the given kernel graph.
*/
void DumpJsonParser::GetCellDumpFlag(const session::KernelGraph &kernel_graph) {
if (dump_mode_ != DUMP_KERNELS_WITH_FLAG) {
return;

View File

@ -37,6 +37,14 @@ uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
return kernel_runtime->device_id();
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Generate the directory path for dump data. It will be in one of these formats:
* 1) tensor/statistic: /dump_path/rank_{rank_id}/{net_name}/{graph_id}/{iter_num}.
* 2) constant data: /dump_path/rank_{rank_id}/{net_name}/{graph_id}/constants/.
*/
std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id, bool is_cst) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
std::string net_name = dump_json_parser.net_name();
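A minimal standalone sketch of the directory layout described above; BuildDumpDir and all values below are illustrative placeholders, not the actual GenerateDumpPath implementation:

#include <cstdint>
#include <iostream>
#include <string>

// Builds the tensor/statistic dump directory in the documented layout:
// /dump_path/rank_{rank_id}/{net_name}/{graph_id}/{iter_num}
std::string BuildDumpDir(const std::string &dump_path, uint32_t rank_id, const std::string &net_name,
                         uint32_t graph_id, uint32_t iter_num, bool is_cst) {
  std::string dir = dump_path + "/rank_" + std::to_string(rank_id) + "/" + net_name + "/" +
                    std::to_string(graph_id) + "/";
  // Constant data goes to a fixed "constants" sub-directory instead of an iteration number.
  dir += is_cst ? "constants/" : std::to_string(iter_num);
  return dir;
}

int main() {
  // Hypothetical values; real values come from DumpJsonParser and the runtime.
  std::cout << BuildDumpDir("/tmp/dump", 0, "ResNet50", 1, 3, false) << std::endl;  // /tmp/dump/rank_0/ResNet50/1/3
  std::cout << BuildDumpDir("/tmp/dump", 0, "ResNet50", 1, 3, true) << std::endl;   // /tmp/dump/rank_0/ResNet50/1/constants/
  return 0;
}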
@ -66,6 +74,12 @@ void GetFileKernelName(NotNull<std::string *> kernel_name) {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU and CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Get the actual tensor shape for dumping based on the trans_flag option in the configuration json file.
*/
void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *> int_shapes, bool trans_flag) {
if (trans_flag) {
*int_shapes = trans::GetRuntimePaddingShape(node, index);
@ -76,6 +90,12 @@ void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Dump the data in memory to the given file path.
*/
void DumpMemToFile(const std::string &file_path, const device::DeviceAddress &addr, const ShapeVector &int_shapes,
const TypeId &type, bool trans_flag) {
auto format = kOpFormat_DEFAULT;
@ -92,6 +112,12 @@ uint64_t GetTimeStamp() {
return timestamp;
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU, CPU.
* Runtime category: Old runtime, MindRT.
* Description: Remove scope from operator name. The default separator is "--".
*/
std::string GetOpNameWithoutScope(const std::string &fullname_with_scope, const std::string &separator) {
std::size_t found = fullname_with_scope.rfind(separator);
std::string op_name;
@ -101,6 +127,13 @@ std::string GetOpNameWithoutScope(const std::string &fullname_with_scope, const
return op_name;
}
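A small standalone sketch of the separator-based scope stripping described above; StripScope and the example operator name are illustrative assumptions, not the MindSpore implementation:

#include <iostream>
#include <string>

// Returns the substring after the last occurrence of the separator, mirroring the behaviour described above.
std::string StripScope(const std::string &fullname_with_scope, const std::string &separator = "--") {
  std::size_t found = fullname_with_scope.rfind(separator);
  if (found == std::string::npos) {
    return fullname_with_scope;  // no scope present
  }
  return fullname_with_scope.substr(found + separator.size());
}

int main() {
  // Hypothetical fullname_with_scope value.
  std::cout << StripScope("Default--network--Conv2D-op1") << std::endl;  // Conv2D-op1
  return 0;
}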
/*
* Feature group: Dump.
* Target device group: Ascend, GPU, CPU.
* Runtime category: Old runtime, MindRT.
 * Description: Dump string content into the given file path. Its current purpose is to save operator overflow
 * information in a json file in Ascend A+M dump mode.
*/
void DumpToFile(const std::string &file_name, const std::string &dump_str) {
if (dump_str.empty()) {
MS_LOG(ERROR) << "Failed to dump empty tensor data.";

View File

@ -102,6 +102,12 @@ bool E2eDump::IsDeviceTargetGPU() {
return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
}
/*
* Feature group: Dump.
* Target device group: GPU.
* Runtime category: Old runtime, MindRT.
 * Description: This function is for dumping a tensor in memory to disk on a GPU machine.
*/
void E2eDump::DumpGPUMemToFile(const std::string &file_path, const std::string &original_kernel_name,
const device::DeviceAddress &addr, const ShapeVector &int_shapes,
const TypeId &host_type, const TypeId &device_type, bool trans_flag, size_t slot,
@ -397,6 +403,13 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m
dump_json_parser.UpdateDumpIter();
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: This function is for updating the dump iteration for the GPU and Ascend old runtime and for Ascend
 * super-kernel MindRT.
*/
void E2eDump::DumpSetup(const session::KernelGraph *graph) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph));
@ -406,11 +419,25 @@ void E2eDump::DumpSetup(const session::KernelGraph *graph) {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
 * Description: This function is for updating the dump iteration for GPU and kernel-by-kernel Ascend MindRT dump.
*/
void E2eDump::UpdateIterMindRTDump() {
// update dump iter for GPU and kernel by kernel ascend dump.
DumpJsonParser::GetInstance().UpdateDumpIter();
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Generates graph history files (dumping all the iteration numbers in which the graph was executed) for
 * the given graph and rank_id. If dataset_sink_mode is true for async dump in Ascend, this function is called once per
 * epoch and dumps all the iterations in the epoch to the graph history file.
*/
void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
auto &json_parser = DumpJsonParser::GetInstance();
if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
@ -454,6 +481,13 @@ void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
ChangeFileMode(file_name, S_IRUSR);
}
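A rough standalone sketch of the append-style graph history file described above (one executed iteration number per line); the file name and directory are hypothetical placeholders, not the paths used by E2eDump::DumpRunIter:

#include <cstdint>
#include <fstream>
#include <string>

// Appends the executed iteration number to a per-graph history file, one number per line.
void AppendRunIter(const std::string &history_dir, uint32_t graph_id, uint32_t iter_num) {
  // Hypothetical file name; the real path is derived from the dump configuration and rank id.
  std::string file_name = history_dir + "/graph_" + std::to_string(graph_id) + "_history.txt";
  std::ofstream fout(file_name, std::ofstream::app);
  if (fout.is_open()) {
    fout << iter_num << "\n";
  }
}

int main() {
  AppendRunIter("/tmp", 1, 0);  // after step 0
  AppendRunIter("/tmp", 1, 1);  // after step 1
  return 0;
}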
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: This function is for dumping the whole graph. It is used for the old runtime in GPU and Ascend and for
 * super-kernel MindRT in Ascend.
*/
void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
MS_EXCEPTION_IF_NULL(graph);
bool success = false;
@ -491,6 +525,12 @@ void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
 * Description: This function is for dumping a single node. It is used for MindRT in GPU and Ascend kernel-by-kernel.
*/
bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) {
bool success = false;
auto &dump_json_parser = DumpJsonParser::GetInstance();
@ -529,6 +569,13 @@ bool E2eDump::isDatasetGraph(const session::KernelGraph *graph) {
}
#ifdef ENABLE_D
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: This function is for Ascend A+M dump only. It parses and converts each slot of the tensor in the DumpData
 * object and dumps the tensor data to an npy file or the statistic data to a csv file.
*/
void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data,
char *data_ptr) {
// dump input tensors
@ -555,6 +602,12 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
}
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: It serves A+M dump. Saves statistics of the tensor data into the dump path as configured.
*/
template <typename T>
bool DumpTensorStatsIfNeeded(const std::string &dump_path, const T &tensor, char *data_ptr, const std::string &io,
uint32_t slot, const ShapeVector &shape, TypeId type) {
@ -591,6 +644,13 @@ bool DumpTensorStatsIfNeeded(const std::string &dump_path, const T &tensor, char
return stat_dump.DumpTensorStatsToFile(dump_path.substr(0, pos), data);
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: It serves A+M dump. Parses each attribute in the DumpData proto object from device format to a MindSpore
 * supported format and saves the tensor data or statistics as configured.
*/
template <typename T>
bool E2eDump::ConvertFormatForTensorAndDump(std::string dump_path, const T &tensor, char *data_ptr,
const std::string &io, uint32_t slot) {
@ -707,6 +767,12 @@ nlohmann::json E2eDump::ParseOverflowInfo(char *data_ptr) {
return overflow_info;
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: This function is for Ascend A+M dump. It parses and dumps op overflow info into a json file.
*/
void E2eDump::DumpOpDebugToFile(const std::string &dump_path, const debugger::dump::DumpData &dump_data,
char *data_ptr) {
std::string out_path = dump_path + ".output.";

View File

@ -71,6 +71,13 @@ DebugServices &DebugServices::operator=(const DebugServices &other) {
return *this;
}
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Create a watchpoint_t object, set the watchpoint's variables and add the watchpoint to the
 * watchpoint_table.
*/
void DebugServices::AddWatchpoint(
unsigned int id, unsigned int watch_condition, float parameter,
const std::vector<std::tuple<std::string, bool>> &check_node_list, const std::vector<parameter_t> &parameter_list,
@ -83,9 +90,11 @@ void DebugServices::AddWatchpoint(
watchpoint_item.condition.type = static_cast<CONDITION_TYPE>(watch_condition);
watchpoint_item.condition.parameter = parameter;
watchpoint_item.check_node_list = check_node_list;
// For the offline debugger, check_node_device_list is not nullptr.
if (check_node_device_list != nullptr) {
watchpoint_item.check_node_device_list = *check_node_device_list;
}
// For the offline debugger, check_node_graph_list is not nullptr.
if (check_node_graph_list != nullptr) {
watchpoint_item.check_node_graph_list = *check_node_graph_list;
}
@ -98,6 +107,13 @@ void DebugServices::RemoveWatchpoint(unsigned int id) {
(void)watchpoint_table_.erase(id);
}
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Returns a tensor summary unique pointer based on the given tensor_dtype; returns nullptr if the type is
 * not supported.
*/
std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData> &tensor,
const void *const previous_tensor_ptr, uint32_t num_elements,
uint32_t prev_num_elements, int tensor_dtype) {
@ -160,6 +176,12 @@ std::unique_ptr<ITensorSummary> GetSummaryPtr(const std::shared_ptr<TensorData>
}
}
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Returns TensorStat for the given tensor based on the base_summary_ptr.
*/
DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_ptr<TensorData> &tensor) {
if (tensor == nullptr) {
MS_LOG(WARNING) << "Tensor is nullptr, returning empty tensor statistics.";
@ -184,7 +206,15 @@ DebugServices::TensorStat DebugServices::GetTensorStatistics(const std::shared_p
return tensor_stat_data;
}
#ifdef OFFLINE_DBG_MODE
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Returns previous_tensor_ptr if the graph history file is found and the current iteration is not the first
 * run iteration for the tensor's graph.
*/
const void *DebugServices::GetPrevTensor(const std::shared_ptr<TensorData> &tensor, bool previous_iter_tensor_needed,
uint32_t *prev_num_elements, bool *history_not_found) {
MS_EXCEPTION_IF_NULL(tensor);
@ -309,6 +339,13 @@ void DebugServices::SetCheckWatchpointsResult(
}
#ifdef OFFLINE_DBG_MODE
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Sets and checks the OUT_OF_MEMORY error_code (for the memory limit feature) and the NO_VALUE error_code
 * (for the new Python API feature). Sets the check watchpoint results.
*/
void DebugServices::CheckOutofMemoryandNoValue(
const bool no_mem_to_read, const bool error_on_no_value, const std::vector<watchpoint_t> watchpoints_to_check,
int chunk_id, partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
@ -339,6 +376,14 @@ void DebugServices::CheckOutofMemoryandNoValue(
}
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: After checking the watchpoints, set the tensor to not-in-use status (for the memory control
 * feature) by pushing it to the eviction candidate queue, so it can be evicted from memory at any time if the memory is
 * required for checking other nodes. If previous_tensor exists, change their statuses as a pair.
*/
void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tensor, const void *previous_tensor_ptr) {
// set the tensor into not-in-use status in tensor_loader.
auto tensor_name = tensor->GetName();
@ -353,6 +398,16 @@ void DebugServices::SetTensorToNotInUse(const std::shared_ptr<TensorData> &tenso
#endif
#ifdef ONLINE_DBG_MODE
/*
* Feature group: Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Compares the current root graph id with the given graph id and returns false if they are not equal
 * for GPU MindRT and Ascend; otherwise, it returns true. The objectives of this function are: 1) Check if the tensor's
 * root_graph_id is different from current_root_graph_id and skip checking watchpoints for the tensor if these values
 * are different. 2) Set prev_tensor_ptr to nullptr if current_root_graph_id is different from prev_root_graph_id.
 * 3) Skip reading the tensor if the tensor's root_graph_id is different from current_root_graph_id.
*/
bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
auto debugger = Debugger::GetInstance();
auto ms_context = MsContext::GetInstance();
@ -368,6 +423,13 @@ bool DebugServices::CompareCurrentRootGraph(uint32_t id) {
return true;
}
/*
* Feature group: Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Returns the previous tensor pointer if the current root graph id is equal to the previous root graph id
 * and prev_tensor_data is not nullptr.
*/
const void *DebugServices::PreparePrevTensor(uint32_t *prev_num_elements, const std::string &tensor_name) {
std::shared_ptr<TensorData> prev_tensor_data;
if (!CompareCurrentRootGraph(Debugger::GetInstance()->GetPrevRootGraphId())) {
@ -391,6 +453,15 @@ void DebugServices::CheckHistoryErrorCode(int *error_code, bool history_not_foun
*error_code = ITensorSummary::HISTORY_NOT_FOUND; // error code for history not found
}
}
/*
* Feature group: Offline debugger, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: For all the tensors in the given chunk, reads the tensors, checks all the watchpoints and sets the
 * watchpoint hit result. The check watchpoint process might be affected by the memory limit, by whether the tensor was
 * read successfully, and by whether we have a multi-root-graph scenario. All of these checks are done in this function.
*/
void DebugServices::CheckWatchpointsForTensor(
partitioned_names *const chunk_names, partitioned_names *const chunk_slots,
partitioned_numbers *const chunk_conditions, partitioned_id *const chunk_watchpoint_id,
@ -501,6 +572,14 @@ void DebugServices::CheckWatchpointsForTensor(
}
}
/*
* Feature group: Offline debugger, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: This function checks the watchpoints for the given tensor list by dividing the tensor list into chunks.
 * Each chunk is handled by a separate thread, and the check watchpoint results from all the threads are then gathered
 * and sorted. In the end, the time taken to check the watchpoints in the current step is reported.
*/
void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::vector<std::string> *const slot,
std::vector<int> *const condition, std::vector<unsigned int> *const watchpoint_id,
std::vector<std::vector<parameter_t>> *const parameters,
@ -574,6 +653,13 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::
MS_LOG(INFO) << "CheckWatchpoints Took: " << ms_double.count() / 1000 << "s";
}
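A compact standalone sketch of the chunk-per-thread pattern described above, using std::async and std::future as stand-ins; the chunk size and the per-chunk work are illustrative assumptions, not the real watchpoint check:

#include <algorithm>
#include <cstddef>
#include <future>
#include <iostream>
#include <numeric>
#include <vector>

int main() {
  std::vector<int> tensor_ids(100);
  std::iota(tensor_ids.begin(), tensor_ids.end(), 0);  // stand-in for the tensor list

  const std::size_t chunk_size = 25;
  std::vector<std::future<std::size_t>> futures;
  for (std::size_t begin = 0; begin < tensor_ids.size(); begin += chunk_size) {
    std::size_t end = std::min(begin + chunk_size, tensor_ids.size());
    // Each chunk is handled by its own thread; the real code checks every watchpoint for every tensor in [begin, end).
    futures.push_back(std::async(std::launch::async, [begin, end]() { return end - begin; }));
  }

  std::size_t checked = 0;
  for (auto &f : futures) {
    checked += f.get();  // gather per-chunk results
  }
  std::cout << "Checked " << checked << " tensors in " << futures.size() << " chunks." << std::endl;
  return 0;
}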
/*
* Feature group: Offline debugger, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Sorts the watchpoint hit results for the online and offline debugger. For the online debugger the
 * sorting is based on the execution order, and for the offline debugger it is based on the time stamp.
*/
void DebugServices::SortWatchpointsInfo(
std::vector<std::future<void>> *const tensor_future_vec, std::vector<int> *const exec_order,
std::vector<std::string> *const time_stamps, uint64_t *const tensor_list_byte_size,
@ -632,6 +718,15 @@ void DebugServices::SortWatchpointsInfo(
}
#ifdef OFFLINE_DBG_MODE
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Read tensor info from the given file. If the memory control feature is enabled, it checks
 * whether the tensor can fit in memory before reading. There are two situations that return false: 1) the tensor size
 * is greater than the total preset memory limit; 2) evicting all not-in-use tensors from tensor_list_map_ cannot make
 * enough room for the tensor.
*/
void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std::string &file_name,
std::string *const tensor_type, std::size_t *const size,
std::vector<int64_t> *const shape, std::vector<char> **const data_buffer,
@ -712,6 +807,13 @@ void DebugServices::ReadTensorFromNpy(const std::string &tensor_name, const std:
}
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: This function converts the files in each directory from device format to host format and appends the
 * converted npy file names to the AsyncFilePool. It is for Ascend async dump only.
*/
void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFilePool *const result_list) {
std::string file_format = "npy";
for (auto const &d : dir_to_files_map) {
@ -731,7 +833,7 @@ void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFil
}
MS_LOG(INFO) << "Number of files to convert: " << files_to_convert_in_dir.size();
if (!files_to_convert_in_dir.empty()) {
// Look for the installation path to the conver_async package. If not found, throw exception and terminate the
// Look for the installation path to the convert_async package. If not found, throw exception and terminate the
// later task.
{
pybind11::gil_scoped_acquire acquire;
@ -748,6 +850,13 @@ void DebugServices::ConvertToHostFormat(const DirMap &dir_to_files_map, AsyncFil
}
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: This function iterates through the dump directory (dump_key), searches for all the converted npy files
 * and appends them to the AsyncFilePool. It is for Ascend async dump only.
*/
void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &files_after_convert_in_dir,
const std::string &dump_key, AsyncFilePool *const result_list,
const std::string &file_format) {
@ -786,6 +895,14 @@ void DebugServices::ProcessConvertToHostFormat(const std::vector<std::string> &f
(void)closedir(d_handle);
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: The node name string is prefixed with the scope and separated with slashes "/", while the npy files in
 * the tensor dump path do not include the scope in their names. The objective of this function is to remove the scope
 * from the node name to match the file.
*/
std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
if (dump_style_name.empty()) {
return "";
@ -799,6 +916,14 @@ std::string GetNodeNameWithoutScope(const std::string &dump_style_name) {
return dump_style_name.substr(last_scope_marker + delim.size());
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: This function searches for and prepares the target npy file to be read for each node. If the found file
 * is already in npy format, it is pushed to the AsyncFilePool; otherwise, the conversion tool in convert_async.py is
 * used to convert it to npy format beforehand.
*/
void DebugServices::ConvertReadTensors(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id, AsyncFilePool *const result_list) {
@ -949,6 +1074,13 @@ void DebugServices::GetTensorDataInfoAsync(const std::vector<std::tuple<std::str
}
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: For the two possible modes (rank and graph), this function returns the rank_id or graph_id extracted
 * from the given directory name; otherwise, it returns UINT32_MAX to identify an invalid rank or graph id.
*/
uint32_t GetRankOrGraphId(const std::string &mode, const std::string &name) {
std::regex re;
if (mode == "rank") {
@ -994,6 +1126,13 @@ std::vector<uint32_t> DebugServices::GetDumpRankIdList() {
return rank_id_list;
}
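A small standalone sketch of extracting a rank or graph id from a directory name, in the spirit of the GetRankOrGraphId description above; the regex patterns and directory names are illustrative assumptions:

#include <cstdint>
#include <iostream>
#include <regex>
#include <string>

// Returns the numeric id in "rank_<id>" (mode "rank") or a bare "<id>" directory (mode "graph");
// returns UINT32_MAX when the name does not match.
uint32_t ExtractRankOrGraphId(const std::string &mode, const std::string &name) {
  std::regex re = (mode == "rank") ? std::regex("^rank_([0-9]+)$") : std::regex("^([0-9]+)$");
  std::smatch match;
  if (std::regex_match(name, match, re)) {
    return static_cast<uint32_t>(std::stoul(match[1].str()));
  }
  return UINT32_MAX;
}

int main() {
  std::cout << ExtractRankOrGraphId("rank", "rank_2") << std::endl;  // 2
  std::cout << ExtractRankOrGraphId("graph", "7") << std::endl;      // 7
  std::cout << ExtractRankOrGraphId("rank", "notes") << std::endl;   // 4294967295 (UINT32_MAX)
  return 0;
}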
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Searches the current dump directory and, for each rank_id in rank_id_list, extracts the existing
 * graph_ids. Then the history file is read for all the extracted graph_ids.
*/
void DebugServices::CheckDumpGraphIdList(std::vector<uint32_t> rank_id_list) {
std::string net_name = GetNetName();
std::string dump_dir = GetDumpDir();
@ -1038,6 +1177,13 @@ void DebugServices::SetGraphsHistory() {
CheckDumpGraphIdList(rank_id_list);
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Reads the graph history file (containing iteration numbers in which the graph was executed) and stores
* the data in graphs_run_history_ for the given rank and graph id.
*/
void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id) {
std::tuple<uint32_t, uint32_t> rank_and_graph(rank_id, root_graph_id);
if (graphs_run_history_.find(rank_and_graph) != graphs_run_history_.end()) {
@ -1060,6 +1206,14 @@ void DebugServices::ReadGraphsHistory(uint32_t rank_id, uint32_t root_graph_id)
(void)closedir(d_handle);
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Returns a map with a tuple (rank, graph) as the key and a vector as the value. This vector contains
 * tuples with two elements: the first element is the node name and the second element is whether the node is an output
 * or not.
*/
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> DebugServices::GetAllWpNodes() {
std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, bool>>> rank_and_graph_to_nodes;
for (auto w_table_item : watchpoint_table_) {
@ -1081,6 +1235,13 @@ std::map<std::tuple<uint32_t, uint32_t>, std::vector<std::tuple<std::string, boo
return rank_and_graph_to_nodes;
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: For the given graph and rank id, reads the graph history file, stores all the run iterations for the
 * graph in a vector and inserts it into the graphs_run_history_ map.
*/
void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t, uint32_t> rank_and_graph) {
std::ifstream infile;
std::string line;
@ -1106,6 +1267,13 @@ void DebugServices::ReadGraphRunIter(std::string file_path, std::tuple<uint32_t,
std::pair<std::tuple<uint32_t, uint32_t>, std::vector<uint32_t>>(rank_and_graph, run_iters_vec));
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Creates a tensor_data object, sets its variables based on the function arguments and adds the tensor
 * to tensor_list_map_.
*/
void DebugServices::AddToTensorData(const std::string &backend_name, const std::string &time_stamp,
const std::size_t slot, const unsigned int iteration, const unsigned int device_id,
const unsigned int root_graph_id, const bool is_output, const std::size_t data_size,
@ -1139,6 +1307,13 @@ void DebugServices::AddToTensorData(const std::string &backend_name, const std::
result_list->push_back(tensor_data);
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Generate a string in the format {no-scope-op-name}.{input-output}.{slot} to check and match the files
 * to read.
*/
void DebugServices::SetPrefixToCheck(std::string *const prefix_dump_file_name, std::string *const slot_string_to_check,
std::string *const dump_style_kernel_name, size_t slot, bool is_output) {
std::string dump_style_name_part = *dump_style_kernel_name;
@ -1179,6 +1354,13 @@ std::string GetTimeStampStr(std::string file_path) {
return "";
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Search the files in the directory (sync mode) or in the AsyncFilePool (async mode) for the one that
 * matches the filename prefix and read the file into memory.
*/
void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std::vector<size_t> slot,
std::vector<unsigned int> device_id, std::vector<unsigned int> iteration,
std::vector<unsigned int> root_graph_id, const std::vector<bool> &is_output,
@ -1216,7 +1398,6 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
}
MS_LOG(INFO) << "specific_dump_dir " << specific_dump_dir;
// search files in dir for the one that meets the filename prefix and read the file into memory
if (is_sync_mode_ || is_cst) {
ReadDumpedTensorSync(prefix_dump_file_name, specific_dump_dir, backend_name[i], slot[i], device_id[i],
iteration[i], root_graph_id[i], is_output[i], result_list, no_mem_to_read);
@ -1227,7 +1408,14 @@ void DebugServices::ReadDumpedTensor(std::vector<std::string> backend_name, std:
}
}
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: For both sync and async dump, gets the newest matched file path, reads the npy file and adds the
 * tensor_data object to tensor_list_map_. If there is no matched file, an empty tensor_data object is created with
 * data_size = 0, an empty shape and a nullptr buffer.
*/
void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<std::string> &matched_paths,
const std::string &backend_name, const unsigned int device_id,
const unsigned int root_graph_id, const bool &is_output, size_t slot,
@ -1254,6 +1442,13 @@ void DebugServices::ReadFileAndAddToTensor(const bool found, const std::vector<s
}
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Looks for the files that match the node_name (in the dump directory) for sync dump, reads the newest
 * file and adds the related tensor_data object.
*/
void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_name, const std::string &specific_dump_dir,
const std::string &backend_name, size_t slot, const unsigned int device_id,
unsigned int iteration, unsigned int root_graph_id, const bool &is_output,
@ -1296,6 +1491,13 @@ void DebugServices::ReadDumpedTensorSync(const std::string &prefix_dump_file_nam
no_mem_to_read, iteration, result_list);
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: Iterates through all the file paths in the async_file_pool, looks for the files that match the
 * node_name for async dump, reads the newest file and adds the related tensor_data object.
*/
void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir, const std::string &prefix_dump_to_check,
const std::string &slot_string_to_check, const std::string &backend_name,
size_t slot, unsigned int device_id, unsigned int iteration,
@ -1322,6 +1524,15 @@ void DebugServices::ReadDumpedTensorAsync(const std::string &specific_dump_dir,
iteration, result_list);
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Obtain the op name, output_str and slot from the npy file name. Make sure its return value is the same
 * as SetPrefixToCheck(). The input/output examples look like:
* input: {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{output_or_input_string}.{slot}.{format}.npy
* output: {op_name}.{output_or_input_string}.{slot}
*/
std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
// strip off the task_id, stream_id, and timestamp, then compare
size_t first_dot = file_name.find(".");
@ -1349,6 +1560,15 @@ std::string DebugServices::GetStrippedFilename(const std::string &file_name) {
return stripped_file_name;
}
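A standalone sketch of the filename mapping shown in the comment above; it assumes the op name itself contains no '.' characters, and the sample file name is invented:

#include <iostream>
#include <sstream>
#include <string>
#include <vector>

// Maps {op_type}.{op_name}.{task_id}.{stream_id}.{timestamp}.{out_in}.{slot}.{format}.npy
// to   {op_name}.{out_in}.{slot}, assuming op_name contains no '.' characters.
std::string StripDumpFileName(const std::string &file_name) {
  std::vector<std::string> parts;
  std::stringstream ss(file_name);
  std::string item;
  while (std::getline(ss, item, '.')) {
    parts.push_back(item);
  }
  const size_t kMinParts = 9;
  if (parts.size() < kMinParts) {
    return file_name;  // unexpected layout, leave unchanged
  }
  size_t n = parts.size();
  // parts[n-1] = "npy", parts[n-2] = format, parts[n-3] = slot, parts[n-4] = input/output string.
  return parts[1] + "." + parts[n - 4] + "." + parts[n - 3];
}

int main() {
  // Hypothetical dump file name for illustration only.
  std::cout << StripDumpFileName("Conv2D.Conv2D-op1.0.0.1629876543.output.0.DefaultFormat.npy") << std::endl;
  // prints: Conv2D-op1.output.0
  return 0;
}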
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Gets a list of the nodes that should be monitored and creates a vector called proto_to_dump with the
 * nodes' original names and dump style names. Then, for each node, it creates an empty tensor_data object with
 * data_byte_size = 0 and data_ptr = nullptr and adds it to the tensor_list (for both sync and async dump). This
 * tensor_list is used by the check watchpoint functions.
*/
std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(unsigned int iteration,
AsyncFilePool *const async_file_pool,
bool error_on_no_value) {
@ -1405,6 +1625,13 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::ReadNeededDumpedTensors(
return tensor_list;
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Iterates through the dump directory and, for each file, looks for a match between the file name and the
 * node names in the proto_to_dump vector.
*/
void DebugServices::ProcessTensorDataSync(const std::vector<std::tuple<std::string, std::string>> &proto_to_dump,
const std::string &specific_dump_dir, unsigned int iteration,
unsigned int device_id, unsigned int root_graph_id,
@ -1463,6 +1690,13 @@ std::string DebugServices::IterationString(unsigned int iteration) {
}
#endif
/*
* Feature group: Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Searches for the tensor in the loaded tensors; if the tensor is found and the tensor's root_graph_id is
 * equal to the current root_graph_id, it updates the given vectors.
*/
void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *const ret_name,
std::vector<const char *> *const data_ptr, std::vector<ssize_t> *const data_size,
std::vector<unsigned int> *const dtype,
@ -1557,6 +1791,14 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Returns the previous iteration in which the tensor's graph was executed. If the current step is the
 * first run iteration for the graph or the graph history file is not available, it returns UINT32_MAX to identify an
 * invalid prev_iteration.
*/
uint32_t DebugServices::GetPrevIteration(const std::shared_ptr<TensorData> &tensor) {
uint32_t prev_iter;
uint32_t rank_id = tensor->GetDeviceId();
@ -1704,6 +1946,13 @@ void DebugServices::AddOpOverflowOpNames(const std::string overflow_bin_path, st
}
}
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: Checks whether an operator overflow happened for the given node by checking the overflow
 * directory.
*/
bool DebugServices::CheckOpOverflow(std::string node_name_to_find, unsigned int device_id, unsigned int root_graph_id,
unsigned int iteration) {
std::string overflow_bin_path = "";

View File

@ -257,6 +257,12 @@ bool Debugger::CheckDebuggerPartialMemoryEnabled() const {
return false;
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
 * Runtime category: Old runtime, MindRT.
* Description: Returns true if online debugger or dump is enabled.
*/
bool Debugger::DebuggerBackendEnabled() const { return CheckDebuggerDumpEnabled() || CheckDebuggerEnabled(); }
void Debugger::Reset() {
@ -284,6 +290,13 @@ void Debugger::Reset() {
MS_LOG(INFO) << "Release Debugger resource.";
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
* Description: Sets root_graph_id for all the graphs in the compiled graph list. Sets cur_root_graph_id_ and
* prev_root_graph_id_ and calls PreExecute function for all the graphs.
*/
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
// MindRTBackend for GPU and Ascend
if (device_target_ == kCPUDevice) {
@ -308,12 +321,25 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
}
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
* Description: When async dump is enabled and dataset_sink_mode is true, graph_iter_num_map_ stores the number of
* iterations per epoch for each running graph.
*/
void Debugger::UpdateGraphIterMap(uint32_t graph_id, int32_t iter_num) {
if (graph_iter_num_map_.find(graph_id) == graph_iter_num_map_.end()) {
graph_iter_num_map_[graph_id] = iter_num;
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend.
* Runtime category: Old runtime.
* Description: For Ascend old runtime, this function sets the current and previous root graph id.
*/
void Debugger::SetCurrentAndPrevRootGraph(uint32_t root_graph_id) {
// for GPU and ascend MindRT root graphs are set in PreExecuteGraphDebugger.
if (device_target_ != kAscendDevice || MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
@ -325,8 +351,15 @@ void Debugger::SetCurrentAndPrevRootGraph(uint32_t root_graph_id) {
<< " for step: " << num_step_ << ".";
}
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Runtime category: Old runtime.
* Description: In the case of GPU old runtime and when we have multiple subgraphs, we use the first run graph id to
* update the step number.
*/
void Debugger::StoreRunGraphIdList(uint32_t graph_id) {
// collect rungrap_ids to update step number in multigraph case
// collect rungraph_ids to update step number in multigraph case for GPU old runtime
if (!rungraph_id_list_.size()) {
rungraph_id_list_.push_back(graph_id);
@ -337,6 +370,13 @@ void Debugger::StoreRunGraphIdList(uint32_t graph_id) {
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Sets previous and current root_graph_id for Ascend old runtime, sends graphs to online debugger when
* debugger_enabled_ is true.
*/
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
MS_EXCEPTION_IF_NULL(graph_ptr);
// access lock for public method
@ -386,6 +426,12 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
suspended_at_last_kernel_ = false;
}
/*
* Feature group: Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Sends all the subgraphs to the online debugger when debugger_enabled_ is true.
*/
void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
// only try to enable debugger if they are not all dataset graphs
if (!debugger_enabled_) {
@ -407,6 +453,12 @@ void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Returns true for e2e dump if dump is enabled for the current iteration.
*/
bool Debugger::DumpDataEnabledIteration() const {
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (!dump_json_parser.e2e_dump_enabled()) {
@ -420,6 +472,12 @@ bool Debugger::DumpDataEnabledIteration() const {
return false;
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
 * Description: Returns the rank_id for GPU and Ascend kernel-by-kernel MindRT.
*/
uint32_t Debugger::GetRankID() {
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
@ -431,8 +489,13 @@ uint32_t Debugger::GetRankID() {
return rank_id;
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
 * Description: Dumps graph history and parameters for GPU and Ascend kernel-by-kernel MindRT. DumpConstantData is
 * called for GPU.
*/
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
// only for GPU and kernel by kernel ascend (mindRT).
if (!(ascend_kernel_by_kernel_ || device_target_ == kGPUDevice)) {
return;
}
@ -461,6 +524,12 @@ void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
 * Description: Dumps a single node for the given graph_id.
*/
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) {
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
uint32_t rank_id = GetRankID();
@ -468,8 +537,14 @@ void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) {
}
}
/*
* Feature group: Dump.
* Target device group: GPU.
* Runtime category: MindRT.
 * Description: This function is used for the new GPU runtime using MindRTBackend; on the Ascend platform, graphs are
 * saved in session_basic.
*/
void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
// This function is used for new GPU runtime using MindRTBackend, on Ascend platform, graphs are saved in other way.
if (device_target_ == kAscendDevice) {
return;
}
@ -488,6 +563,12 @@ void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU and CPU.
* Runtime category: MindRT.
 * Description: Load and dump parameters and constant data, call PostExecute and update the dump iteration.
*/
void Debugger::PostExecuteGraphDebugger() {
// On CPU, update the dump iteration. Parameters and consts are not dumped here.
if (device_target_ == kCPUDevice) {
@ -519,6 +600,12 @@ void Debugger::PostExecuteGraphDebugger() {
}
}
/*
* Feature group: Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Send hit watchpoints, update the step number and reset loaded tensors.
*/
void Debugger::PostExecute() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
@ -565,6 +652,13 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
return false;
}
/*
* Feature group: Online debugger.
* Target device group: GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Check and send the watchpoint hit for a single node; suspend if a watchpoint is hit or we are
 * continuing at the node level.
*/
void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
@ -597,6 +691,12 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Get the graph proto, add it to the graph proto list and add the loaded graph pointers to a list.
*/
void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
MS_EXCEPTION_IF_NULL(graph_ptr);
if (graph_ptr_ != graph_ptr) {
@ -670,6 +770,12 @@ GraphProto Debugger::GetGraphProto(const KernelGraphPtr &graph_ptr) const {
return model.graph();
}
/*
* Feature group: Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Send the debugger backend heartbeat to the online debugger every few seconds.
*/
void Debugger::SendHeartbeat(int32_t period) {
int num_heartbeat_fail = 0;
const int max_num_heartbeat_fail = 5;
@ -1407,6 +1513,12 @@ bool Debugger::CheckIp(const std::string &host) const {
uint32_t Debugger::GetFirstRunGraphId() const { return rungraph_id_list_.front(); }
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Load a single parameter or value node.
*/
void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id) {
MS_EXCEPTION_IF_NULL(anf_node);
if (!anf_node->isa<Parameter>() && !anf_node->isa<ValueNode>()) {
@ -1450,6 +1562,12 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Load all the parameters and value nodes for the last loaded graph.
*/
void Debugger::LoadParametersAndConst() {
if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
MS_EXCEPTION_IF_NULL(graph_ptr_);
@ -1469,6 +1587,12 @@ void Debugger::LoadParametersAndConst() {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Load all the parameters and value nodes for the given graph.
*/
void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
MS_EXCEPTION_IF_NULL(graph);
@ -1488,6 +1612,12 @@ void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
}
}
/*
* Feature group: Online debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
* Description: Load all the kernels for the last loaded graph.
*/
void Debugger::LoadGraphOutputs() {
if (!(debugger_enabled() && device_target_ == kAscendDevice)) return;
MS_EXCEPTION_IF_NULL(graph_ptr_);
@ -1528,6 +1658,12 @@ void Debugger::LoadGraphOutputs() {
}
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: MindRT.
 * Description: Load a single node for kernel-by-kernel Ascend MindRT dump.
*/
void Debugger::LoadNodeOutputs(const CNodePtr &node, uint32_t exec_order, uint32_t root_graph_id) {
if (device_target_ != kAscendDevice) {
return;
@ -1563,10 +1699,15 @@ void Debugger::LoadNodeOutputs(const CNodePtr &node, uint32_t exec_order, uint32
}
}
/*
* Feature group: Online debugger.
* Target device group: GPU.
* Runtime category: Old runtime.
* Description: Update step number if we are processing the first graph (to support multigraph).
*/
void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(debugger_);
// update step number if we are processing the first graph (to support multigraph)
if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()) &&
(graph->graph_id() == debugger_->GetFirstRunGraphId())) {
// access lock for public method
@ -1575,8 +1716,13 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
}
}
/*
* Feature group: Online debugger.
* Target device group: GPU.
* Runtime category: MindRT.
* Description: Update step number when DebugActor::DebugOnStepEnd is called at the end of each step.
*/
void Debugger::UpdateStepNumGPU() {
// UpdateStepNum with DebugActor::DebugOnStepEnd
if (device_target_ == kGPUDevice && (debugger_enabled_ || DumpDataEnabledIteration())) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
@ -1600,6 +1746,13 @@ bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
}
#ifdef ENABLE_D
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: Load the DumpDataBuilder object from dump_data_construct_map_ for tracking the data chunks of
 * node_name. It is for Ascend A+M dump. If not found, create a new one and add it to dump_data_construct_map_.
*/
std::shared_ptr<DumpDataBuilder> Debugger::LoadDumpDataBuilder(const std::string &node_name) {
auto iter = dump_data_construct_map_.find(node_name);
if (iter == dump_data_construct_map_.end()) {

View File

@ -41,8 +41,13 @@ using KernelGraph = mindspore::session::KernelGraph;
using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
namespace mindspore {
/*
* Feature group: Online debugger.
* Target device group: GPU.
* Runtime category: MindRT.
 * Description: Returns a vector containing the real output numbers.
*/
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
// define a vector containing real output number
std::vector<size_t> real_outputs;
// P.BatchNorm is used for training and inference
// can add the filter list for more operators here....
@ -58,6 +63,12 @@ std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &
return real_outputs;
}
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Runtime category: MindRT.
* Description: Get kernel inputs from launch_info and load the inputs from device to host.
*/
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
uint32_t root_graph_id) {
// get inputs
@ -86,6 +97,12 @@ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Runtime category: MindRT.
 * Description: Get kernel outputs from launch_info and load the outputs from device to host.
*/
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
uint32_t root_graph_id) {
// get outputs
@ -116,6 +133,13 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uin
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
 * Description: Returns true if the node needs to be read for Dump or the online debugger. This function is used by
 * GPU and Ascend kernel-by-kernel MindRT.
*/
bool CheckReadData(const CNodePtr &cnode) {
auto debugger = Debugger::GetInstance();
if (!debugger) {
@ -136,6 +160,13 @@ bool CheckReadData(const CNodePtr &cnode) {
return read_data;
}
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Runtime category: MindRT.
 * Description: Load the inputs and outputs of the given node if needed and dump them if dump is enabled, then perform
 * the PostExecuteNode function on the given node.
*/
void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order) {
auto debugger = Debugger::GetInstance();
if (!debugger) {
@ -167,6 +198,12 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
debugger->PostExecuteNode(cnode, last_kernel);
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: MindRT.
* Description: Load outputs of the given node and dump them if dump is enabled for Ascend kernel-by-kernel dump.
*/
void ReadDataAndDumpAscend(const CNodePtr &cnode, uint32_t exec_order) {
auto debugger = Debugger::GetInstance();
if (!debugger) {
@ -192,6 +229,13 @@ void ReadDataAndDumpAscend(const CNodePtr &cnode, uint32_t exec_order) {
}
}
/*
* Feature group: Dump, Online Debugger.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
 * Description: Returns the error_info when sink_mode is true and we are in online debugger mode or dump mode for
 * GPU; if everything is normal, the error_info string will be empty.
*/
std::string CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
std::string error_info = "";
bool sink_mode = ConfigManager::GetInstance().dataset_mode() || graph_ptr->IsDatasetGraph();
@ -208,6 +252,12 @@ std::string CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
return error_info;
}
/*
* Feature group: Online Debugger.
* Target device group: Ascend.
* Runtime category: MindRT.
 * Description: Loads the graph's outputs and parameters for Ascend super-kernel mode.
*/
void LoadDataForDebugger(const KernelGraphPtr &graph_ptr) {
auto context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(context);
@ -265,6 +315,16 @@ void SuperKernelE2eDump(const KernelGraphPtr &graph) {
}
#ifdef ENABLE_D
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
 * Description: It is a function to be registered to the Adx server for the A+M dump feature with the following steps:
* 1) Merge chunks into one memory segment after receiving all the data for one node.
* 2) Parse dump data object.
* 3) Convert data from device to host format.
* 4) Dump to disk based on configuration.
*/
int32_t DumpDataCallBack(const DumpChunk *dump_chunk, int32_t size) {
MS_LOG(DEBUG) << "ADX DumpDataCallBack is called";
string file_name = dump_chunk->fileName;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020-2021 Huawei Technologies Co., Ltd
* Copyright 2020-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -111,6 +111,13 @@ TensorSummary<T>::TensorSummary(const void *current_tensor_ptr, const void *cons
epsilon_(1.0e-9),
mean_sd_cal_enabled_(false) {}
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
 * Description: Initialize the watchpoint calculators based on the watchpoint category. Process all the elements
 * within the current tensor.
*/
template <typename T>
void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoint_t> &wps) {
InitCalculators(wps);
@ -156,6 +163,12 @@ void TensorSummary<T>::SummarizeTensor(const std::vector<DebugServices::watchpoi
}
}
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Calculates statistics on chunks of data.
*/
template <typename T>
void TensorSummary<T>::TensorStatistics(DbgDataType dtype_value) {
if (dtype_value == DT_BOOL) {
@ -211,6 +224,12 @@ void TensorSummary<T>::TensorStatistics(DbgDataType dtype_value) {
}
}
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Process all the elements of the chunked data and calculate the statistics.
*/
template <typename T>
void TensorSummary<T>::TensorStatisticsSingleThread() {
MeanCalculator mean_calc = MeanCalculator();
@ -244,6 +263,14 @@ void TensorSummary<T>::TensorStatisticsSingleThread() {
avg_ = mean_calc.GetMean();
}
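
A single-pass statistics loop of this kind can be sketched standalone as follows (running mean, min, max and NaN count over a flat buffer); this is a simplified illustration, not the actual MeanCalculator-based implementation.

#include <algorithm>
#include <cmath>
#include <cstddef>
#include <iostream>
#include <limits>
#include <vector>

struct MiniStats {
  double mean = 0.0;
  double min = std::numeric_limits<double>::infinity();
  double max = -std::numeric_limits<double>::infinity();
  std::size_t nan_count = 0;
};

// Single-threaded pass over all elements, updating a running mean and the extrema.
template <typename T>
MiniStats TensorStatisticsSingleThreadSketch(const std::vector<T> &data) {
  MiniStats stats;
  std::size_t n = 0;
  for (const T &elem : data) {
    double value = static_cast<double>(elem);
    if (std::isnan(value)) {
      ++stats.nan_count;
      continue;
    }
    ++n;
    stats.mean += (value - stats.mean) / static_cast<double>(n);  // incremental mean
    stats.min = std::min(stats.min, value);
    stats.max = std::max(stats.max, value);
  }
  return stats;
}

int main() {
  std::vector<float> data = {1.0f, 2.0f, 3.0f, 4.0f};
  MiniStats s = TensorStatisticsSingleThreadSketch(data);
  std::cout << "mean=" << s.mean << " min=" << s.min << " max=" << s.max << "\n";
  return 0;
}
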
/*
* Feature group: Online debugger, Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Returns a tuple with three elements: the first is a bool that is true if the watchpoint is hit, the
* second is the error_code set in this function, and the third is the parameter_list for the watchpoint.
*/
template <typename T>
std::tuple<bool, int, std::vector<DebugServices::parameter_t>> TensorSummary<T>::IsWatchpointHit(
DebugServices::watchpoint_t wp) {
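
On the caller side, a tuple shaped like this is convenient to unpack with structured bindings; the stand-in below only mimics the shape of the return value, not the real watchpoint logic.

#include <iostream>
#include <string>
#include <tuple>
#include <vector>

struct Parameter {
  std::string name;
  double value;
};

// Stand-in with the same tuple shape: (hit flag, error code, parameter list).
std::tuple<bool, int, std::vector<Parameter>> IsWatchpointHitSketch(double max_value, double threshold) {
  bool hit = max_value > threshold;
  int error_code = 0;  // would be set on invalid parameters, unsupported comparisons, etc.
  std::vector<Parameter> parameter_list = {{"max", max_value}, {"threshold", threshold}};
  return std::make_tuple(hit, error_code, parameter_list);
}

int main() {
  auto [hit, error_code, parameter_list] = IsWatchpointHitSketch(5.0, 3.0);
  std::cout << "hit=" << hit << " error_code=" << error_code
            << " parameters=" << parameter_list.size() << "\n";
  return 0;
}
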

View File

@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -37,6 +37,13 @@ class DumpDataBuilder {
~DumpDataBuilder() = default;
#ifdef ENABLE_D
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
* Description: This function is for A+M dump only. In each callback, allocate memory and copy the dump chunk from
* Adx. Return false if OOM.
*/
bool CopyDumpChunk(const DumpChunk *dump_chunk) {
try {
uint32_t buf_sz = dump_chunk->bufLen;
@ -50,6 +57,14 @@ class DumpDataBuilder {
return true;
}
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
* Description: This function is for A+M dump only. When receiving the last chunk of the node (is_last_chunk = true),
* parse and construct the dump data for dumping. It performs these steps: 1) merge all chunks for the node; 2) parse
* the header and protobuf string; 3) memcpy the tensor data to a contiguous memory segment.
*/
bool ConstructDumpData(debugger::dump::DumpData *dump_data_proto, std::vector<char> *data_ptr) {
if (chunk_list_.empty()) {
return false;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -78,6 +78,13 @@ class TensorLoader {
return std::equal(suffix.rbegin(), suffix.rend(), tensor_name.rbegin());
}
/*
* Feature group: Dump, Online debugger and Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Load a new tensor into tensor_list_map_ (debugger backend cache). In the offline debugger, add ":prev" to
* the previous tensor's name to avoid a segfault caused by wrongly evicting that tensor when the memory limit is enabled.
*/
bool LoadNewTensor(std::shared_ptr<TensorData> tensor, bool keep_prev) {
lock_.lock();
auto tensor_name = tensor->GetName();
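
The ":prev" bookkeeping can be pictured with a plain map: when keep_prev is set, the existing entry is re-keyed under "<name>:prev" before the new tensor is inserted. This is a simplified sketch without the real TensorLoader locking or TensorData type.

#include <iostream>
#include <map>
#include <memory>
#include <string>

struct MiniTensor {
  std::string name;
  int iteration;
};

std::map<std::string, std::shared_ptr<MiniTensor>> tensor_list_map;

// Insert a new tensor; optionally keep the previous one reachable under "<name>:prev".
bool LoadNewTensorSketch(const std::shared_ptr<MiniTensor> &tensor, bool keep_prev) {
  const std::string &name = tensor->name;
  auto iter = tensor_list_map.find(name);
  if (iter != tensor_list_map.end()) {
    if (keep_prev) {
      tensor_list_map[name + ":prev"] = iter->second;  // protect the previous tensor from eviction
    }
    tensor_list_map.erase(name);
  }
  tensor_list_map[name] = tensor;
  return true;
}

int main() {
  LoadNewTensorSketch(std::make_shared<MiniTensor>(MiniTensor{"Conv2D.output.0", 1}), false);
  LoadNewTensorSketch(std::make_shared<MiniTensor>(MiniTensor{"Conv2D.output.0", 2}), true);
  std::cout << "cached tensors: " << tensor_list_map.size() << "\n";  // 2: current and ":prev"
  return 0;
}
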
@ -124,6 +131,13 @@ class TensorLoader {
return nullptr;
}
/*
* Feature group: Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: Search tensor_list_map_ (debugger backend cache) and obtain the TensorData for a list of tensors. For
* any tensor that is not found, nullptr is returned in the result list.
*/
void SearchTensors(const std::vector<std::string> &search_list,
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
for (auto i : search_list) {
@ -147,6 +161,14 @@ class TensorLoader {
bool EnableMemoryControl() { return mem_total_ > 0; }
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: This function is for the memory control feature only. When the offline debugger finishes using a
* tensor, the tensor is added to cache_evict_queue_ and becomes an eviction candidate. When there is not enough memory
* to read in a new tensor, candidates are evicted from the cache.
*/
void AppendToCacheEvictQueue(const std::string &tensor_name) {
std::lock_guard<std::mutex> lk(mem_lock_);
if (std::find(cache_evict_queue_.begin(), cache_evict_queue_.end(), tensor_name) == cache_evict_queue_.end()) {
@ -155,6 +177,13 @@ class TensorLoader {
}
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: This function is for the memory control feature only. Check whether the tensor size is greater than the
* preset limit. If not, evict candidate tensors in cache_evict_queue_ to make room for it.
*/
bool CheckMemoryAvailable(const std::string &backend_name, const uint64_t data_size) {
// 1. Check if the tensor can fit in the entire limit. If not, don't attempt any read or evictions and generate
// warning.
@ -168,6 +197,13 @@ class TensorLoader {
return ret;
}
/*
* Feature group: Offline debugger.
* Target device group: Ascend, GPU.
* Runtime category: Old runtime, MindRT.
* Description: This function is for the memory control feature only. Greedily evict not-in-use tensors from the cache
* queue. If there is no candidate in the queue, block the thread until one becomes available.
*/
bool CheckAndEvictTensorCache(const uint64_t data_size) {
std::string candidate_name;
uint64_t candidates_size;
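
The blocking-eviction idea can be sketched with a mutex and condition variable: wait until a not-in-use candidate shows up in the queue, then evict greedily until the new tensor fits. The names and sizes below are hypothetical simplifications of the real TensorLoader members.

#include <condition_variable>
#include <cstdint>
#include <deque>
#include <iostream>
#include <map>
#include <mutex>
#include <string>

std::mutex mem_lock;
std::condition_variable evict_cv;
std::deque<std::string> cache_evict_queue;             // not-in-use tensors, eviction candidates
std::map<std::string, uint64_t> cached_tensor_sizes;   // bytes held per cached tensor
uint64_t mem_usage = 0;
const uint64_t mem_total = 1024;

// Block until enough memory is free for data_size, greedily evicting queued candidates.
bool CheckAndEvictTensorCacheSketch(uint64_t data_size) {
  std::unique_lock<std::mutex> lock(mem_lock);
  while (mem_usage + data_size > mem_total) {
    // If no candidate is queued yet, block until another thread appends one and notifies.
    evict_cv.wait(lock, [] { return !cache_evict_queue.empty(); });
    std::string candidate = cache_evict_queue.front();
    cache_evict_queue.pop_front();
    auto it = cached_tensor_sizes.find(candidate);
    if (it == cached_tensor_sizes.end()) {
      continue;  // already evicted elsewhere
    }
    mem_usage -= it->second;
    cached_tensor_sizes.erase(it);
    std::cout << "evicted " << candidate << "\n";
  }
  mem_usage += data_size;
  return true;
}

int main() {
  cached_tensor_sizes = {{"old_tensor_a", 600}, {"old_tensor_b", 400}};
  mem_usage = 1000;
  cache_evict_queue = {"old_tensor_a", "old_tensor_b"};
  CheckAndEvictTensorCacheSketch(400);
  std::cout << "mem_usage=" << mem_usage << "\n";  // 800 after evicting old_tensor_a
  return 0;
}
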
@ -199,6 +235,12 @@ class TensorLoader {
void SetMemTotal(uint64_t total_mem_size) { this->mem_total_ = total_mem_size; }
#ifdef ONLINE_DBG_MODE
/*
* Feature group: Dump.
* Target device group: GPU.
* Runtime category: Old runtime, MindRT.
* Description: Load tensor data from debugger backend cache (tensor_list_map_) and dump to file in npy format.
*/
bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
TypeId device_type, const std::string &addr_format, size_t slot) {

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -597,6 +597,12 @@ AscendDeviceAddress::~AscendDeviceAddress() {
}
#ifndef ENABLE_SECURITY
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
* Description: Dump tensor data to file for e2e dump.
*/
bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, bool trans_flag) const {
bool ret = false;
@ -640,6 +646,12 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std::
#endif
#ifdef ENABLE_DEBUGGER
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend.
* Runtime category: Old runtime, MindRT.
* Description: Load tensor to host and create tensor_data object for the loaded tensor.
*/
bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id) const {

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -139,6 +139,12 @@ void GPUDeviceAddress::ClearDeviceMemory() {
GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Runtime category: Old runtime, MindRT.
* Description: Load tensor to host and create tensor_data object for the loaded tensor.
*/
#ifdef ENABLE_DEBUGGER
bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,

View File

@ -133,6 +133,12 @@ std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &out
return real_outputs;
}
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Runtime category: Old runtime.
* Description: Load data and dump the node if needed.
*/
#ifdef ENABLE_DEBUGGER
void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
const std::vector<mindspore::kernel::AddressPtr> &kernel_inputs,
@ -743,6 +749,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
#ifdef ENABLE_DEBUGGER
bool dump_enabled = GPUKernelRuntime::DumpDataEnabledIteration();
if (!mock && debugger_) {
// Update the step number for old GPU runtime.
debugger_->UpdateStepNum(graph);
}
#endif

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019-2021 Huawei Technologies Co., Ltd
* Copyright 2019-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -355,11 +355,13 @@ void KernelRuntime::RunOpClearMemory(const session::KernelGraph &graph) const {
#ifdef ENABLE_DEBUGGER
bool KernelRuntime::DumpDataEnabled() {
// Returns true if e2e dump is enabled.
auto &dump_json_parser = DumpJsonParser::GetInstance();
return dump_json_parser.e2e_dump_enabled();
}
bool KernelRuntime::DumpDataEnabledIteration() {
// Returns true if e2e dump is enabled and current iteration must be dumped.
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (!dump_json_parser.e2e_dump_enabled()) {
return false;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -32,6 +32,13 @@
namespace mindspore {
namespace runtime {
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
* Description: Load and read data for the given node if needed. Dump the node if dump is enabled and free the loaded
* memory after the dump (for GPU and Ascend kernel-by-kernel).
*/
void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_info_,
const DeviceContext *device_context, OpContext<DeviceTensor> *const op_context,
const AID *from_aid) {
@ -91,6 +98,12 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
ActorDispatcher::Send(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend.
* Runtime category: MindRT.
* Description: Load data for online debugger and dump graph for e2e dump mode (Ascend super kernel mode).
*/
void DebugActor::DebugForGraph(const KernelGraphPtr &graph, const DeviceContext *device_context,
OpContext<DeviceTensor> *const op_context, const AID *from_aid) {
MS_EXCEPTION_IF_NULL(graph);
@ -109,6 +122,12 @@ void DebugActor::DebugForGraph(const KernelGraphPtr &graph, const DeviceContext
ActorDispatcher::Send(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
* Runtime category: MindRT.
* Description: Checks dataset_sink_mode, generates the related error if one exists, and calls PreExecuteGraphDebugger.
*/
void DebugActor::DebugOnStepBegin(std::vector<KernelGraphPtr> graphs, std::vector<DeviceContext *> device_contexts,
OpContext<DeviceTensor> *const op_context, const AID *from_aid) {
MS_EXCEPTION_IF_NULL(op_context);
@ -144,6 +163,13 @@ void DebugActor::DebugOnStepBegin(std::vector<KernelGraphPtr> graphs, std::vecto
ActorDispatcher::Send(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU and CPU.
* Runtime category: MindRT.
* Description: Dump parameters and constants and update the dump iteration for CPU. Call PostExecuteGraphDebugger for
* GPU and Ascend, and update the step number of the online debugger for GPU.
*/
void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const AID *from_aid) {
MS_EXCEPTION_IF_NULL(op_context);
MS_EXCEPTION_IF_NULL(from_aid);

View File

@ -1,5 +1,5 @@
/**
* Copyright 2021 Huawei Technologies Co., Ltd
* Copyright 2021-2022 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -415,6 +415,7 @@ GraphId GraphCompiler::CompileGraph(const FuncGraphPtr &func_graph, const Device
auto graph_id = CompileGraphImpl(root_graph, device_context);
// Dump all graphs.
// For Ascend mindRT.
session_->DumpGraphs(all_graphs);
// Cache the backend graph output nodes to front nodes with output index.
@ -488,8 +489,10 @@ GraphId GraphCompiler::CompileGraphImpl(const KernelGraphPtr &graph, const Devic
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
// Dump graph for GPU mindRT if dump is enabled.
debugger->DumpInGraphCompiler(graph);
if (debugger && debugger->DebuggerBackendEnabled()) {
// Load graphs for GPU and Ascend mindRT.
debugger->LoadGraphs(graph);
}
#endif

View File

@ -294,6 +294,8 @@ void GraphScheduler::BuildAndScheduleGlobalActor() {
(void)actor_manager->Spawn(base_recorder_actor, true);
// Create and schedule debug actor.
// debugger_actor_need is true for CPU when e2e dump is enabled; for Ascend and GPU it is true when the debugger or
// dump is enabled.
#ifndef ENABLE_SECURITY
bool debugger_actor_need = DumpJsonParser::GetInstance().e2e_dump_enabled();
#endif

View File

@ -203,6 +203,13 @@ void InitMemReuseExecOrder(KernelGraph *kernel_graph) {
UnfoldRecursiveExecOrder(kernel_graph);
}
} // namespace
/*
* Feature group: Dump.
* Target device group: Ascend.
* Runtime category: MindRT.
* Description: Parse the dump json config file and register the callback to Adx.
*/
#ifndef ENABLE_SECURITY
void DumpInit(uint32_t device_id) {
auto &json_parser = DumpJsonParser::GetInstance();