forked from mindspore-Ecosystem/mindspore
Use origin_parameters_order to load and dump parameters in MindRT.
Refactor MindRT code. Fix DumpConstantData issue.
This commit is contained in:
parent
5c8d48d809
commit
f6bebc7d97
|
@ -644,7 +644,7 @@ void AscendSession::PostExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
#ifndef ENABLE_SECURITY
|
#ifndef ENABLE_SECURITY
|
||||||
DumpSetup(kernel_graph);
|
E2eDump::UpdateIterOldRTDump(kernel_graph.get());
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -1401,13 +1401,6 @@ void AscendSession::Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bo
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifndef ENABLE_SECURITY
|
#ifndef ENABLE_SECURITY
|
||||||
void AscendSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
|
||||||
MS_LOG(DEBUG) << "Start!";
|
|
||||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
|
||||||
E2eDump::DumpSetup(kernel_graph.get());
|
|
||||||
MS_LOG(DEBUG) << "Finish!";
|
|
||||||
}
|
|
||||||
|
|
||||||
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
void AscendSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
||||||
MS_LOG(DEBUG) << "Start!";
|
MS_LOG(DEBUG) << "Start!";
|
||||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
MS_EXCEPTION_IF_NULL(kernel_graph);
|
||||||
|
|
|
@ -114,7 +114,6 @@ class AscendSession : public SessionBasic {
|
||||||
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
|
void Execute(const std::shared_ptr<KernelGraph> &kernel_graph, bool is_task) const;
|
||||||
#ifndef ENABLE_SECURITY
|
#ifndef ENABLE_SECURITY
|
||||||
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||||
void DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
|
||||||
#endif
|
#endif
|
||||||
void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
void LoadTensor(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||||
// below functions are used for run op
|
// below functions are used for run op
|
||||||
|
|
|
@ -518,7 +518,7 @@ void GPUSession::PreExecuteGraph(const std::shared_ptr<KernelGraph> &kernel_grap
|
||||||
debugger_->PreExecute(kernel_graph);
|
debugger_->PreExecute(kernel_graph);
|
||||||
}
|
}
|
||||||
|
|
||||||
DumpSetup(kernel_graph);
|
E2eDump::UpdateIterOldRTDump(kernel_graph.get());
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
#if ENABLE_CPU && ENABLE_GPU
|
#if ENABLE_CPU && ENABLE_GPU
|
||||||
|
@ -725,12 +725,6 @@ void GPUSession::RunOpImpl(const GraphInfo &graph_info, OpRunInfo *op_run_info,
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ENABLE_DEBUGGER
|
#ifdef ENABLE_DEBUGGER
|
||||||
void GPUSession::DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
|
||||||
MS_LOG(INFO) << "Start!";
|
|
||||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
|
||||||
E2eDump::DumpSetup(kernel_graph.get());
|
|
||||||
MS_LOG(INFO) << "Finish!";
|
|
||||||
}
|
|
||||||
|
|
||||||
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
void GPUSession::Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const {
|
||||||
// Dump graph and graph history file if e2e_dump is enabled and update cur_dump_iter for GPU old runtime.
|
// Dump graph and graph history file if e2e_dump is enabled and update cur_dump_iter for GPU old runtime.
|
||||||
|
|
|
@ -94,8 +94,6 @@ class GPUSession : public SessionBasic {
|
||||||
#ifdef ENABLE_DEBUGGER
|
#ifdef ENABLE_DEBUGGER
|
||||||
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
void Dump(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
||||||
|
|
||||||
void DumpSetup(const std::shared_ptr<KernelGraph> &kernel_graph) const;
|
|
||||||
|
|
||||||
bool DumpDataEnabledIteration() const;
|
bool DumpDataEnabledIteration() const;
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -1422,6 +1422,10 @@ void KernelGraph::SetOptimizerFlag() {
|
||||||
bool KernelGraph::IsDatasetGraph() const {
|
bool KernelGraph::IsDatasetGraph() const {
|
||||||
// check if there is InitDataSetQueue node
|
// check if there is InitDataSetQueue node
|
||||||
const auto &nodes = execution_order_;
|
const auto &nodes = execution_order_;
|
||||||
|
// The size of execution_order for the dataset graph is equal to 1.
|
||||||
|
if (execution_order_.size() > 1) {
|
||||||
|
return false;
|
||||||
|
}
|
||||||
for (const auto &node : nodes) {
|
for (const auto &node : nodes) {
|
||||||
auto node_name = AnfAlgo::GetCNodeName(node);
|
auto node_name = AnfAlgo::GetCNodeName(node);
|
||||||
if (node_name == prim::kPrimInitDataSetQueue->name()) {
|
if (node_name == prim::kPrimInitDataSetQueue->name()) {
|
||||||
|
|
|
@ -26,6 +26,9 @@
|
||||||
#include "runtime/device/kernel_runtime_manager.h"
|
#include "runtime/device/kernel_runtime_manager.h"
|
||||||
#include "utils/utils.h"
|
#include "utils/utils.h"
|
||||||
#include "debug/common.h"
|
#include "debug/common.h"
|
||||||
|
#include "runtime/framework/device_tensor_store.h"
|
||||||
|
|
||||||
|
using mindspore::runtime::DeviceTensorStore;
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
|
uint32_t ConvertPhysicalDeviceId(uint32_t device_id) {
|
||||||
|
@ -90,6 +93,24 @@ void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
const DeviceTensorPtr GetParameterInfo(const AnfNodePtr &node, NotNull<ShapeVector *> int_shapes,
|
||||||
|
NotNull<TypeId *> host_type, NotNull<TypeId *> device_type) {
|
||||||
|
const auto &device_tensors = DeviceTensorStore::GetInstance().Fetch(node.get());
|
||||||
|
if (device_tensors.size() < 1) {
|
||||||
|
return nullptr;
|
||||||
|
}
|
||||||
|
auto device_addr = device_tensors[0];
|
||||||
|
MS_EXCEPTION_IF_NULL(device_addr);
|
||||||
|
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||||
|
bool trans_flag = dump_json_parser.trans_flag();
|
||||||
|
auto ref_node = device_addr->GetNodeIndex().first;
|
||||||
|
MS_EXCEPTION_IF_NULL(ref_node);
|
||||||
|
GetDumpIntShape(ref_node, PARAMETER_OUTPUT_INDEX, int_shapes, trans_flag);
|
||||||
|
*host_type = AnfAlgo::GetOutputInferDataType(ref_node, PARAMETER_OUTPUT_INDEX);
|
||||||
|
*device_type = AnfAlgo::GetOutputDeviceDataType(ref_node, PARAMETER_OUTPUT_INDEX);
|
||||||
|
return device_addr;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Feature group: Dump.
|
* Feature group: Dump.
|
||||||
* Target device group: Ascend, CPU.
|
* Target device group: Ascend, CPU.
|
||||||
|
|
|
@ -19,10 +19,14 @@
|
||||||
|
|
||||||
#include <map>
|
#include <map>
|
||||||
#include <string>
|
#include <string>
|
||||||
|
#include <memory>
|
||||||
|
|
||||||
#include "backend/session/kernel_graph.h"
|
#include "backend/session/kernel_graph.h"
|
||||||
#include "runtime/device/device_address.h"
|
#include "runtime/device/device_address.h"
|
||||||
|
|
||||||
|
using DeviceTensor = mindspore::device::DeviceAddress;
|
||||||
|
using DeviceTensorPtr = std::shared_ptr<DeviceTensor>;
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
static const size_t PARAMETER_OUTPUT_INDEX = 0;
|
static const size_t PARAMETER_OUTPUT_INDEX = 0;
|
||||||
static const size_t VALUE_NODE_OUTPUT_INDEX = 0;
|
static const size_t VALUE_NODE_OUTPUT_INDEX = 0;
|
||||||
|
@ -33,6 +37,9 @@ void GetFileKernelName(NotNull<std::string *> kernel_name);
|
||||||
|
|
||||||
void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *> int_shapes, bool trans_flag = false);
|
void GetDumpIntShape(const AnfNodePtr &node, size_t index, NotNull<ShapeVector *> int_shapes, bool trans_flag = false);
|
||||||
|
|
||||||
|
const DeviceTensorPtr GetParameterInfo(const AnfNodePtr &node, NotNull<ShapeVector *> int_shapes,
|
||||||
|
NotNull<TypeId *> host_type, NotNull<TypeId *> device_type);
|
||||||
|
|
||||||
void DumpMemToFile(const std::string &file_path, const device::DeviceAddress &addr, const ShapeVector &int_shapes,
|
void DumpMemToFile(const std::string &file_path, const device::DeviceAddress &addr, const ShapeVector &int_shapes,
|
||||||
const TypeId &type, bool trans_flag = false);
|
const TypeId &type, bool trans_flag = false);
|
||||||
// Get time stamp since epoch in microseconds
|
// Get time stamp since epoch in microseconds
|
||||||
|
|
|
@ -305,7 +305,6 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
|
||||||
dump_name = node_name.substr(cst_prefix.length());
|
dump_name = node_name.substr(cst_prefix.length());
|
||||||
trans_flag = false;
|
trans_flag = false;
|
||||||
}
|
}
|
||||||
|
|
||||||
// check if output address exists, if not, return;
|
// check if output address exists, if not, return;
|
||||||
if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
|
if (!AnfAlgo::OutputAddrExist(anf_node, output_index)) {
|
||||||
return;
|
return;
|
||||||
|
@ -334,6 +333,49 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Feature group: Dump.
|
||||||
|
* Target device group: Ascend, GPU.
|
||||||
|
* Runtime category: MindRT.
|
||||||
|
* Description: This function is similar to DumpSingleAnfNode function but it is only for dumping parameters in mindRT.
|
||||||
|
* This function uses GetParameterInfo to get dump info for the parameter node.
|
||||||
|
*/
|
||||||
|
void E2eDump::DumpSingleParameterNode(const AnfNodePtr &anf_node, const std::string &dump_path, bool trans_flag,
|
||||||
|
const Debugger *debugger) {
|
||||||
|
MS_EXCEPTION_IF_NULL(anf_node);
|
||||||
|
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||||
|
std::string node_name = GetKernelNodeName(anf_node);
|
||||||
|
if (!anf_node->isa<Parameter>() || !dump_json_parser.NeedDump(node_name)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
DumpJsonParser::GetInstance().MatchKernel(node_name);
|
||||||
|
GetFileKernelName(NOT_NULL(&node_name));
|
||||||
|
ShapeVector int_shapes;
|
||||||
|
TypeId type;
|
||||||
|
TypeId device_type;
|
||||||
|
auto addr = GetParameterInfo(anf_node, NOT_NULL(&int_shapes), NOT_NULL(&type), NOT_NULL(&device_type));
|
||||||
|
if (addr == nullptr) {
|
||||||
|
MS_LOG(DEBUG) << "Skip node: " << node_name << ". Parameter data is not available for mindRT.";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
uint64_t timestamp = GetTimeStamp();
|
||||||
|
uint32_t task_id = 0;
|
||||||
|
uint32_t stream_id = 0;
|
||||||
|
std::string file_path = dump_path + "/Parameter." + node_name + '.' + std::to_string(task_id) + '.' +
|
||||||
|
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output.0";
|
||||||
|
if (IsDeviceTargetGPU()) {
|
||||||
|
if (dump_json_parser.IsStatisticDump()) {
|
||||||
|
TensorStatDump stat_dump("Parameter", node_name, task_id, stream_id, timestamp, false, 0, 0);
|
||||||
|
stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
|
||||||
|
}
|
||||||
|
if (dump_json_parser.IsTensorDump()) {
|
||||||
|
DumpGPUMemToFile(file_path, node_name, *addr, int_shapes, type, device_type, trans_flag, 0, debugger);
|
||||||
|
}
|
||||||
|
} else {
|
||||||
|
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
void E2eDump::DumpParameters(const session::KernelGraph *graph, const std::string &dump_path,
|
void E2eDump::DumpParameters(const session::KernelGraph *graph, const std::string &dump_path,
|
||||||
const Debugger *debugger) {
|
const Debugger *debugger) {
|
||||||
MS_EXCEPTION_IF_NULL(graph);
|
MS_EXCEPTION_IF_NULL(graph);
|
||||||
|
@ -380,9 +422,16 @@ void E2eDump::DumpConstantData(const session::KernelGraph *graph, const std::str
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_mode) {
|
/*
|
||||||
uint32_t graph_id = graph->graph_id();
|
* Feature group: Dump.
|
||||||
|
* Target device group: Ascend, GPU.
|
||||||
|
* Runtime category: Old runtime.
|
||||||
|
* Description: This function is for updating dump iteration for GPU and ascend old runtime.
|
||||||
|
*/
|
||||||
|
void E2eDump::UpdateIterOldRTDump(const session::KernelGraph *graph) {
|
||||||
|
MS_EXCEPTION_IF_NULL(graph);
|
||||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||||
|
uint32_t graph_id = graph->graph_id();
|
||||||
if (IsDeviceTargetGPU()) {
|
if (IsDeviceTargetGPU()) {
|
||||||
if (starting_graph_id == INT32_MAX) {
|
if (starting_graph_id == INT32_MAX) {
|
||||||
starting_graph_id = graph_id;
|
starting_graph_id = graph_id;
|
||||||
|
@ -394,7 +443,7 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// If device target is Ascend
|
// If device target is Ascend
|
||||||
if (sink_mode && graph->IsDatasetGraph()) {
|
if (graph->IsDatasetGraph()) {
|
||||||
MS_LOG(INFO) << "No need to update iteration for dataset graph.";
|
MS_LOG(INFO) << "No need to update iteration for dataset graph.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
|
@ -403,29 +452,23 @@ void E2eDump::UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_m
|
||||||
dump_json_parser.UpdateDumpIter();
|
dump_json_parser.UpdateDumpIter();
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
|
||||||
* Feature group: Dump.
|
|
||||||
* Target device group: Ascend, GPU.
|
|
||||||
* Runtime category: Old runtime, MindRT.
|
|
||||||
* Description: This function is for updating dump iteration for GPU and ascend old runtime and ascend super
|
|
||||||
* kernel MindRT.
|
|
||||||
*/
|
|
||||||
void E2eDump::DumpSetup(const session::KernelGraph *graph) {
|
|
||||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
|
||||||
bool sink_mode = (ConfigManager::GetInstance().dataset_mode() || E2eDump::isDatasetGraph(graph));
|
|
||||||
|
|
||||||
if (dump_json_parser.async_dump_enabled() || dump_json_parser.e2e_dump_enabled()) {
|
|
||||||
UpdateIterDumpSetup(graph, sink_mode);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Feature group: Dump.
|
* Feature group: Dump.
|
||||||
* Target device group: Ascend, GPU.
|
* Target device group: Ascend, GPU.
|
||||||
* Runtime category: MindRT.
|
* Runtime category: MindRT.
|
||||||
* Description: This function is for updating dump iteration for GPU and kernel by kernel ascend MindRT dump.
|
* Description: This function is for updating dump iteration for GPU and ascend MindRT dump. Please note that dump with
|
||||||
|
* dataset_sink_mode = True is not supported for GPU.
|
||||||
*/
|
*/
|
||||||
void E2eDump::UpdateIterMindRTDump() {
|
void E2eDump::UpdateIterMindRTDump() {
|
||||||
|
auto debugger = Debugger::GetInstance();
|
||||||
|
// Dataset graph is always the first graph in the list when dataset_sink_mode is true.
|
||||||
|
auto graph = (debugger->GetStepGraphPtrList())[0];
|
||||||
|
auto context = MsContext::GetInstance();
|
||||||
|
MS_EXCEPTION_IF_NULL(context);
|
||||||
|
if (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kAscendDevice && graph->IsDatasetGraph()) {
|
||||||
|
MS_LOG(INFO) << "No need to update iteration for dataset graph.";
|
||||||
|
return;
|
||||||
|
}
|
||||||
// update dump iter for GPU and kernel by kernel ascend dump.
|
// update dump iter for GPU and kernel by kernel ascend dump.
|
||||||
DumpJsonParser::GetInstance().UpdateDumpIter();
|
DumpJsonParser::GetInstance().UpdateDumpIter();
|
||||||
}
|
}
|
||||||
|
@ -464,7 +507,7 @@ void E2eDump::DumpRunIter(const KernelGraphPtr &graph, uint32_t rank_id) {
|
||||||
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
|
MS_LOG(WARNING) << "Open file for saving graph global execution order failed.";
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
if (sink_mode && json_parser.async_dump_enabled()) {
|
if (sink_mode && json_parser.async_dump_enabled() && !Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
|
||||||
// for async dump when sink_mode = true, cur_dump_iter() = current_epoch
|
// for async dump when sink_mode = true, cur_dump_iter() = current_epoch
|
||||||
// dump history for all iterations in the epoch
|
// dump history for all iterations in the epoch
|
||||||
Debugger::GetInstance()->UpdateGraphIterMap(graph->graph_id(), iter_num);
|
Debugger::GetInstance()->UpdateGraphIterMap(graph->graph_id(), iter_num);
|
||||||
|
@ -501,16 +544,16 @@ void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
|
||||||
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
MS_LOG(INFO) << "Start e2e dump. Current iteration is " << dump_json_parser.cur_dump_iter();
|
||||||
MS_LOG(INFO) << "Current graph id is " << graph_id;
|
MS_LOG(INFO) << "Current graph id is " << graph_id;
|
||||||
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
|
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
|
||||||
std::string cst_path = GenerateDumpPath(graph_id, rank_id, true);
|
|
||||||
|
|
||||||
if (dump_json_parser.IsStatisticDump()) {
|
if (dump_json_parser.IsStatisticDump()) {
|
||||||
TensorStatDump::OpenStatisticsFile(dump_path);
|
TensorStatDump::OpenStatisticsFile(dump_path);
|
||||||
}
|
}
|
||||||
DumpInput(graph, dump_path, debugger);
|
DumpInput(graph, dump_path, debugger);
|
||||||
DumpOutput(graph, dump_path, debugger);
|
DumpOutput(graph, dump_path, debugger);
|
||||||
DumpParameters(graph, dump_path, debugger);
|
if (!MsContext::GetInstance()->get_param<bool>(MS_CTX_ENABLE_MINDRT)) {
|
||||||
if (IsDeviceTargetGPU() && dump_json_parser.e2e_dump_enabled()) {
|
// Dump parameters for old runtime. For mindRT it is done in PostExecuteGraphDebugger.
|
||||||
DumpConstantData(graph, cst_path, debugger);
|
DumpParameters(graph, dump_path, debugger);
|
||||||
|
// DumpConstantData for GPU old runtime.
|
||||||
|
DumpConstantData(graph, rank_id, debugger);
|
||||||
}
|
}
|
||||||
if (dump_json_parser.IsStatisticDump()) {
|
if (dump_json_parser.IsStatisticDump()) {
|
||||||
CsvWriter::GetInstance().CloseFile();
|
CsvWriter::GetInstance().CloseFile();
|
||||||
|
@ -543,29 +586,29 @@ bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32
|
||||||
return success;
|
return success;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool E2eDump::DumpParametersData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger) {
|
/*
|
||||||
bool success = false;
|
* Feature group: Dump.
|
||||||
uint32_t graph_id = graph->graph_id();
|
* Target device group: Ascend, GPU.
|
||||||
|
* Runtime category: MindRT.
|
||||||
|
* Description: This function is for dumping all the parameters in the current root graph for GPU, Ascend superkernel
|
||||||
|
* (e2e dump) and Ascend kernel-by-kernel (e2e and async dump).
|
||||||
|
*/
|
||||||
|
void E2eDump::DumpParametersData(uint32_t rank_id, const Debugger *debugger) {
|
||||||
|
uint32_t root_graph_id = debugger->GetCurrentRootGraphId();
|
||||||
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
auto &dump_json_parser = DumpJsonParser::GetInstance();
|
||||||
|
if (dump_json_parser.async_dump_enabled() && !debugger->GetAscendKernelByKernelFlag()) {
|
||||||
|
// Dump parameters for mindRT in async dump only for kernel by kernel mode.
|
||||||
|
return;
|
||||||
|
}
|
||||||
if (dump_json_parser.DumpEnabledForIter()) {
|
if (dump_json_parser.DumpEnabledForIter()) {
|
||||||
MS_LOG(INFO) << "DumpParameters. Current iteration is " << dump_json_parser.cur_dump_iter();
|
MS_LOG(INFO) << "DumpParameters. Current iteration is " << dump_json_parser.cur_dump_iter();
|
||||||
MS_LOG(INFO) << "Current graph id is " << graph_id;
|
MS_LOG(INFO) << "Current root graph id is " << root_graph_id;
|
||||||
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
|
std::string dump_path = GenerateDumpPath(root_graph_id, rank_id);
|
||||||
DumpParameters(graph, dump_path, debugger);
|
bool trans_flag = dump_json_parser.trans_flag();
|
||||||
success = true;
|
for (auto &item : debugger->GetParametersMindRT()) {
|
||||||
}
|
DumpSingleParameterNode(item, dump_path, trans_flag, debugger);
|
||||||
return success;
|
|
||||||
}
|
|
||||||
bool E2eDump::isDatasetGraph(const session::KernelGraph *graph) {
|
|
||||||
// check if there is GetNext or InitDataSetQueue node
|
|
||||||
const auto &nodes = graph->execution_order();
|
|
||||||
for (const auto &node : nodes) {
|
|
||||||
auto node_name = AnfAlgo::GetCNodeName(node);
|
|
||||||
if (node_name == prim::kPrimGetNext->name() || node_name == prim::kPrimInitDataSetQueue->name()) {
|
|
||||||
return true;
|
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
return false;
|
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ENABLE_D
|
#ifdef ENABLE_D
|
||||||
|
|
|
@ -38,10 +38,10 @@ class E2eDump {
|
||||||
public:
|
public:
|
||||||
E2eDump() = default;
|
E2eDump() = default;
|
||||||
~E2eDump() = default;
|
~E2eDump() = default;
|
||||||
static void DumpSetup(const session::KernelGraph *graph);
|
|
||||||
|
|
||||||
static void UpdateIterMindRTDump();
|
static void UpdateIterMindRTDump();
|
||||||
|
|
||||||
|
static void UpdateIterOldRTDump(const session::KernelGraph *graph);
|
||||||
|
|
||||||
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
|
static void DumpRunIter(const KernelGraphPtr &graph_ptr, uint32_t rank_id = 0);
|
||||||
|
|
||||||
static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
|
static void DumpData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
|
||||||
|
@ -51,13 +51,11 @@ class E2eDump {
|
||||||
|
|
||||||
static void DumpConstantData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
|
static void DumpConstantData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger = nullptr);
|
||||||
|
|
||||||
static bool DumpParametersData(const session::KernelGraph *graph, uint32_t rank_id, const Debugger *debugger);
|
static void DumpParametersData(uint32_t rank_id, const Debugger *debugger);
|
||||||
|
|
||||||
static bool DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id,
|
static bool DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id,
|
||||||
const Debugger *debugger = nullptr);
|
const Debugger *debugger = nullptr);
|
||||||
|
|
||||||
static bool isDatasetGraph(const session::KernelGraph *graph);
|
|
||||||
|
|
||||||
// Dump data when task error.
|
// Dump data when task error.
|
||||||
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
|
||||||
std::string *kernel_name, const Debugger *debugger);
|
std::string *kernel_name, const Debugger *debugger);
|
||||||
|
@ -91,7 +89,8 @@ class E2eDump {
|
||||||
static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
|
static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
|
||||||
bool trans_flag, const Debugger *debugger);
|
bool trans_flag, const Debugger *debugger);
|
||||||
|
|
||||||
static void UpdateIterDumpSetup(const session::KernelGraph *graph, bool sink_mode);
|
static void DumpSingleParameterNode(const AnfNodePtr &anf_node, const std::string &dump_path, bool trans_flag,
|
||||||
|
const Debugger *debugger);
|
||||||
|
|
||||||
#ifdef ENABLE_D
|
#ifdef ENABLE_D
|
||||||
static nlohmann::json ParseOverflowInfo(char *data_ptr);
|
static nlohmann::json ParseOverflowInfo(char *data_ptr);
|
||||||
|
|
|
@ -38,6 +38,7 @@
|
||||||
#include "runtime/hardware/device_context_manager.h"
|
#include "runtime/hardware/device_context_manager.h"
|
||||||
#include "debug/anf_ir_dump.h"
|
#include "debug/anf_ir_dump.h"
|
||||||
#include "debug/anf_ir_utils.h"
|
#include "debug/anf_ir_utils.h"
|
||||||
|
#include "runtime/framework/device_tensor_store.h"
|
||||||
#ifdef ENABLE_DEBUGGER
|
#ifdef ENABLE_DEBUGGER
|
||||||
#include "debug/debugger/proto_exporter.h"
|
#include "debug/debugger/proto_exporter.h"
|
||||||
#else
|
#else
|
||||||
|
@ -56,6 +57,7 @@ using debugger::WatchCondition_Condition_nan;
|
||||||
using debugger::WatchCondition_Parameter;
|
using debugger::WatchCondition_Parameter;
|
||||||
using debugger::WatchNode;
|
using debugger::WatchNode;
|
||||||
using debugger::WatchpointHit;
|
using debugger::WatchpointHit;
|
||||||
|
using mindspore::runtime::DeviceTensorStore;
|
||||||
|
|
||||||
namespace mindspore {
|
namespace mindspore {
|
||||||
|
|
||||||
|
@ -287,6 +289,8 @@ void Debugger::Reset() {
|
||||||
graph_proto_list_.clear();
|
graph_proto_list_.clear();
|
||||||
graph_ptr_list_.clear();
|
graph_ptr_list_.clear();
|
||||||
graph_ptr_step_vec_.clear();
|
graph_ptr_step_vec_.clear();
|
||||||
|
parameters_mindRT_.clear();
|
||||||
|
visited_root_graph_ids_.clear();
|
||||||
MS_LOG(INFO) << "Release Debugger resource.";
|
MS_LOG(INFO) << "Release Debugger resource.";
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -297,13 +301,15 @@ void Debugger::Reset() {
|
||||||
* Description: Sets root_graph_id for all the graphs in the compiled graph list. Sets cur_root_graph_id_ and
|
* Description: Sets root_graph_id for all the graphs in the compiled graph list. Sets cur_root_graph_id_ and
|
||||||
* prev_root_graph_id_ and calls PreExecute function for all the graphs.
|
* prev_root_graph_id_ and calls PreExecute function for all the graphs.
|
||||||
*/
|
*/
|
||||||
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs) {
|
void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs,
|
||||||
|
const std::vector<AnfNodePtr> &origin_parameters_order) {
|
||||||
// MindRTBackend for GPU and Ascend
|
// MindRTBackend for GPU and Ascend
|
||||||
if (device_target_ == kCPUDevice) {
|
if (device_target_ == kCPUDevice) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// Store graphs that are run in one step.
|
// Store graphs that are run in one step.
|
||||||
graph_ptr_step_vec_ = graphs;
|
graph_ptr_step_vec_ = graphs;
|
||||||
|
parameters_mindRT_ = origin_parameters_order;
|
||||||
prev_root_graph_id_ = cur_root_graph_id_;
|
prev_root_graph_id_ = cur_root_graph_id_;
|
||||||
// set first run graph as the root graph
|
// set first run graph as the root graph
|
||||||
cur_root_graph_id_ = graph_ptr_step_vec_[0]->graph_id();
|
cur_root_graph_id_ = graph_ptr_step_vec_[0]->graph_id();
|
||||||
|
@ -474,21 +480,31 @@ uint32_t Debugger::GetRankID() {
|
||||||
* Feature group: Dump.
|
* Feature group: Dump.
|
||||||
* Target device group: Ascend, GPU.
|
* Target device group: Ascend, GPU.
|
||||||
* Runtime category: MindRT.
|
* Runtime category: MindRT.
|
||||||
* Description: Dumps graph history and parameters for GPU and Ascend kernel-by-kernel MindRT. DumpConstantData for GPU.
|
* Description: When dump is enabled, this function: 1) Dumps parameters for the current root_graph_id to the
|
||||||
|
* root_graph's directory. 2) Dumps constant data once for each graph. 3) Dumps graph run history for each graph.
|
||||||
*/
|
*/
|
||||||
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
|
void Debugger::DumpParamsAndConstAndHistory() {
|
||||||
if (!(ascend_kernel_by_kernel_ || device_target_ == kGPUDevice)) {
|
if (!CheckDebuggerDumpEnabled()) {
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
uint32_t rank_id = GetRankID();
|
LoadParametersAllGraphs();
|
||||||
E2eDump::DumpRunIter(kernel_graph, rank_id);
|
(void)E2eDump::DumpParametersData(GetRankID(), debugger_.get());
|
||||||
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
|
// Whether constant data was already dumped for the current root graph.
|
||||||
MS_EXCEPTION_IF_NULL(kernel_graph);
|
bool cur_root_graph_checked = std::find(visited_root_graph_ids_.begin(), visited_root_graph_ids_.end(),
|
||||||
(void)E2eDump::DumpParametersData(kernel_graph.get(), rank_id, debugger_.get());
|
cur_root_graph_id_) != visited_root_graph_ids_.end();
|
||||||
// Dump constant data for GPU mindRT.
|
for (auto graph : graph_ptr_step_vec_) {
|
||||||
E2eDump::DumpConstantData(kernel_graph.get(), rank_id, debugger_.get());
|
if (!cur_root_graph_checked) {
|
||||||
} else {
|
LoadConstsForGraph(graph);
|
||||||
DumpJsonParser::GetInstance().UpdateDumpIter();
|
// Dump constant data for GPU.
|
||||||
|
E2eDump::DumpConstantData(graph.get(), GetRankID(), debugger_.get());
|
||||||
|
// Dump constant data for Ascend.
|
||||||
|
DumpConstantDataAscend(graph);
|
||||||
|
}
|
||||||
|
// Dump graph run history for each graph.
|
||||||
|
E2eDump::DumpRunIter(graph, GetRankID());
|
||||||
|
}
|
||||||
|
if (!cur_root_graph_checked) {
|
||||||
|
visited_root_graph_ids_.push_back(cur_root_graph_id_);
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
@ -556,29 +572,15 @@ void Debugger::PostExecuteGraphDebugger() {
|
||||||
DumpJsonParser::GetInstance().UpdateDumpIter();
|
DumpJsonParser::GetInstance().UpdateDumpIter();
|
||||||
return;
|
return;
|
||||||
}
|
}
|
||||||
// LoadParametersAndConst for all the graphs that have been run in the current step
|
DumpParamsAndConstAndHistory();
|
||||||
if (debugger_ && device_target_ == kGPUDevice) {
|
|
||||||
for (auto graph : graph_ptr_step_vec_) {
|
|
||||||
debugger_->LoadParametersAndConst(graph);
|
|
||||||
}
|
|
||||||
}
|
|
||||||
// debug used for dump
|
// debug used for dump
|
||||||
if (debugger_ && debugger_->CheckDebuggerDumpEnabled()) {
|
if (CheckDebuggerDumpEnabled() && !debugger_enabled()) {
|
||||||
// Dump Parameters and consts
|
ClearCurrentData();
|
||||||
for (auto graph : graph_ptr_step_vec_) {
|
|
||||||
debugger_->Dump(graph);
|
|
||||||
DumpConstantDataAscend(graph);
|
|
||||||
if (!debugger_->debugger_enabled()) {
|
|
||||||
debugger_->ClearCurrentData();
|
|
||||||
}
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
if (debugger_) {
|
if (debugger_) {
|
||||||
debugger_->PostExecute();
|
debugger_->PostExecute();
|
||||||
}
|
}
|
||||||
if (ascend_kernel_by_kernel_ || device_target_ == kGPUDevice) {
|
E2eDump::UpdateIterMindRTDump();
|
||||||
E2eDump::UpdateIterMindRTDump();
|
|
||||||
}
|
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -1341,7 +1343,10 @@ bool Debugger::DumpTensorToFile(const std::string &tensor_name, bool trans_flag,
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
|
bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
|
||||||
return debug_services_.get()->LoadNewTensor(tensor, keep_prev);
|
if (debug_services_ != nullptr) {
|
||||||
|
return debug_services_.get()->LoadNewTensor(tensor, keep_prev);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Debugger::debugger_enabled() const { return debugger_enabled_; }
|
bool Debugger::debugger_enabled() const { return debugger_enabled_; }
|
||||||
|
@ -1543,6 +1548,37 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) {
|
||||||
|
MS_EXCEPTION_IF_NULL(node);
|
||||||
|
auto root_graph_id = cur_root_graph_id_;
|
||||||
|
// This function is only for loading parameters mindRT.
|
||||||
|
std::string node_name = GetKernelNodeName(node);
|
||||||
|
GetFileKernelName(NOT_NULL(&node_name));
|
||||||
|
TypeId type;
|
||||||
|
TypeId device_type;
|
||||||
|
ShapeVector int_shapes;
|
||||||
|
auto device_addr = GetParameterInfo(node, NOT_NULL(&int_shapes), NOT_NULL(&type), NOT_NULL(&device_type));
|
||||||
|
if (device_addr == nullptr) {
|
||||||
|
MS_LOG(DEBUG) << "Skip node: " << node_name << ". Parameter data is not available for mindRT.";
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
if (!IsTypeDebuggerSupported(type)) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
auto format = kOpFormat_DEFAULT;
|
||||||
|
string tensor_name = node_name + ':' + "0";
|
||||||
|
if (debug_services_ != nullptr) {
|
||||||
|
debug_services_->MoveTensorCurrentToPrev(tensor_name);
|
||||||
|
}
|
||||||
|
// Keep_prev is True for parameters.
|
||||||
|
bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id);
|
||||||
|
|
||||||
|
if (!ret) {
|
||||||
|
MS_LOG(ERROR) << "LoadMemToHost:"
|
||||||
|
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Feature group: Dump, Online debugger.
|
* Feature group: Dump, Online debugger.
|
||||||
* Target device group: Ascend, GPU.
|
* Target device group: Ascend, GPU.
|
||||||
|
@ -1593,6 +1629,43 @@ void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Feature group: Dump.
|
||||||
|
* Target device group: GPU.
|
||||||
|
* Runtime category: MindRT.
|
||||||
|
* Description: This function is for loading parameters' data from device to host into tensor_list_map_ for GPU dump.
|
||||||
|
* Ascend does not use tensor_map_list_ for dump so it is not needed for ascend dump.
|
||||||
|
*/
|
||||||
|
void Debugger::LoadParametersAllGraphs() {
|
||||||
|
if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
for (auto &node : parameters_mindRT_) {
|
||||||
|
LoadSingleParameterMindRT(node);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
|
/*
|
||||||
|
* Feature group: Dump.
|
||||||
|
* Target device group: GPU.
|
||||||
|
* Runtime category: MindRT.
|
||||||
|
* Description: This function is for loading constant data from device to host into tensor_list_map_ for GPU dump.
|
||||||
|
* Ascend does not use tensor_map_list_ for dump so it is not needed for ascend dump.
|
||||||
|
*/
|
||||||
|
void Debugger::LoadConstsForGraph(const KernelGraphPtr &graph) {
|
||||||
|
if (!(device_target_ == kGPUDevice && CheckDebuggerDumpEnabled())) {
|
||||||
|
return;
|
||||||
|
}
|
||||||
|
// load value nodes
|
||||||
|
// get all constant values from the graph
|
||||||
|
MS_LOG(INFO) << "Start to load value nodes for graph " << graph->graph_id() << ".";
|
||||||
|
auto root_graph_id = graph->root_graph_id();
|
||||||
|
const auto value_nodes = graph->graph_value_nodes();
|
||||||
|
for (auto &item : value_nodes) {
|
||||||
|
LoadSingleAnfnode(item, VALUE_NODE_OUTPUT_INDEX, root_graph_id);
|
||||||
|
}
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Feature group: Online debugger.
|
* Feature group: Online debugger.
|
||||||
* Target device group: Ascend.
|
* Target device group: Ascend.
|
||||||
|
@ -1683,7 +1756,10 @@ void Debugger::ClearCurrentData() {
|
||||||
}
|
}
|
||||||
|
|
||||||
bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
|
bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
|
||||||
return debug_services_->TensorExistsInCurrent(tensor_name);
|
if (debug_services_ != nullptr) {
|
||||||
|
return debug_services_->TensorExistsInCurrent(tensor_name);
|
||||||
|
}
|
||||||
|
return false;
|
||||||
}
|
}
|
||||||
|
|
||||||
#ifdef ENABLE_D
|
#ifdef ENABLE_D
|
||||||
|
|
|
@ -29,6 +29,7 @@
|
||||||
#ifdef ENABLE_D
|
#ifdef ENABLE_D
|
||||||
#include "debug/dump_data_builder.h"
|
#include "debug/dump_data_builder.h"
|
||||||
#endif
|
#endif
|
||||||
|
#include "runtime/device/device_address.h"
|
||||||
|
|
||||||
using debugger::Chunk;
|
using debugger::Chunk;
|
||||||
using debugger::DataType;
|
using debugger::DataType;
|
||||||
|
@ -41,6 +42,8 @@ using debugger::WatchCondition;
|
||||||
using debugger::WatchCondition_Parameter;
|
using debugger::WatchCondition_Parameter;
|
||||||
using debugger::WatchNode;
|
using debugger::WatchNode;
|
||||||
using debugger::WatchpointHit;
|
using debugger::WatchpointHit;
|
||||||
|
using DeviceTensor = mindspore::device::DeviceAddress;
|
||||||
|
using DeviceTensorPtr = std::shared_ptr<DeviceTensor>;
|
||||||
|
|
||||||
template <class T>
|
template <class T>
|
||||||
using ProtoVector = google::protobuf::RepeatedPtrField<T>;
|
using ProtoVector = google::protobuf::RepeatedPtrField<T>;
|
||||||
|
@ -77,7 +80,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
// reset debugger
|
// reset debugger
|
||||||
void Reset();
|
void Reset();
|
||||||
|
|
||||||
void PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs);
|
void PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs,
|
||||||
|
const std::vector<AnfNodePtr> &origin_parameters_order);
|
||||||
// enable debugger
|
// enable debugger
|
||||||
// send graph and wait for command
|
// send graph and wait for command
|
||||||
// do nothing if graph is set already
|
// do nothing if graph is set already
|
||||||
|
@ -87,6 +91,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
|
|
||||||
void SetAscendKernelByKernelFlag(bool value) { ascend_kernel_by_kernel_ = value; }
|
void SetAscendKernelByKernelFlag(bool value) { ascend_kernel_by_kernel_ = value; }
|
||||||
|
|
||||||
|
// Returns the current value of ascend_kernel_by_kernel_ (the flag written by SetAscendKernelByKernelFlag).
bool GetAscendKernelByKernelFlag() const { return ascend_kernel_by_kernel_; }
|
||||||
|
|
||||||
void StoreRunGraphIdList(uint32_t graph_id);
|
void StoreRunGraphIdList(uint32_t graph_id);
|
||||||
|
|
||||||
// analyze tensors and wait for command
|
// analyze tensors and wait for command
|
||||||
|
@ -97,8 +103,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
|
|
||||||
static uint32_t GetRankID();
|
static uint32_t GetRankID();
|
||||||
|
|
||||||
void Dump(const KernelGraphPtr &kernel_graph) const;
|
|
||||||
|
|
||||||
void DumpConstantDataAscend(const KernelGraphPtr &graph);
|
void DumpConstantDataAscend(const KernelGraphPtr &graph);
|
||||||
|
|
||||||
void DumpSingleNode(const CNodePtr &node, uint32_t graph_id);
|
void DumpSingleNode(const CNodePtr &node, uint32_t graph_id);
|
||||||
|
@ -144,6 +148,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
|
|
||||||
void LoadParametersAndConst(const KernelGraphPtr &graph);
|
void LoadParametersAndConst(const KernelGraphPtr &graph);
|
||||||
|
|
||||||
|
void LoadParametersAllGraphs();
|
||||||
|
|
||||||
|
void LoadConstsForGraph(const KernelGraphPtr &graph);
|
||||||
|
|
||||||
|
void DumpParamsAndConstAndHistory();
|
||||||
|
|
||||||
void UpdateStepNum(const session::KernelGraph *graph);
|
void UpdateStepNum(const session::KernelGraph *graph);
|
||||||
|
|
||||||
void UpdateStepNumGPU();
|
void UpdateStepNumGPU();
|
||||||
|
@ -162,6 +172,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
|
|
||||||
uint32_t GetPrevRootGraphId() const { return prev_root_graph_id_; }
|
uint32_t GetPrevRootGraphId() const { return prev_root_graph_id_; }
|
||||||
|
|
||||||
|
// Returns graph_ptr_step_vec_ by value, so the caller receives its own copy of the vector.
std::vector<KernelGraphPtr> GetStepGraphPtrList() const { return graph_ptr_step_vec_; }
|
||||||
|
|
||||||
void SetGraphPtr(const KernelGraphPtr &graph_ptr) { graph_ptr_ = graph_ptr; }
|
void SetGraphPtr(const KernelGraphPtr &graph_ptr) { graph_ptr_ = graph_ptr; }
|
||||||
|
|
||||||
const KernelGraphPtr GetGraphPtr() const { return graph_ptr_; }
|
const KernelGraphPtr GetGraphPtr() const { return graph_ptr_; }
|
||||||
|
@ -180,6 +192,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
|
|
||||||
void UpdateGraphIterMap(uint32_t graph_id, int32_t iter_num);
|
void UpdateGraphIterMap(uint32_t graph_id, int32_t iter_num);
|
||||||
|
|
||||||
|
// Returns parameters_mindRT_ by value, so the caller receives its own copy of the vector.
std::vector<AnfNodePtr> GetParametersMindRT() const { return parameters_mindRT_; }
|
||||||
|
|
||||||
#ifdef ENABLE_D
|
#ifdef ENABLE_D
|
||||||
std::shared_ptr<DumpDataBuilder> LoadDumpDataBuilder(const std::string &node_name);
|
std::shared_ptr<DumpDataBuilder> LoadDumpDataBuilder(const std::string &node_name);
|
||||||
|
|
||||||
|
@ -271,6 +285,8 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
|
|
||||||
void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id);
|
void LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output_index, uint32_t root_graph_id);
|
||||||
|
|
||||||
|
void LoadSingleParameterMindRT(const AnfNodePtr &anf_node);
|
||||||
|
|
||||||
// class members
|
// class members
|
||||||
|
|
||||||
std::unique_ptr<GrpcClient> grpc_client_;
|
std::unique_ptr<GrpcClient> grpc_client_;
|
||||||
|
@ -301,6 +317,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
|
||||||
std::list<KernelGraphPtr> graph_ptr_list_;
|
std::list<KernelGraphPtr> graph_ptr_list_;
|
||||||
// The vector of graph pointers that have been run in the current step.
|
// The vector of graph pointers that have been run in the current step.
|
||||||
std::vector<KernelGraphPtr> graph_ptr_step_vec_;
|
std::vector<KernelGraphPtr> graph_ptr_step_vec_;
|
||||||
|
// The vector of all the parameters for the current step for mindRT.
|
||||||
|
std::vector<AnfNodePtr> parameters_mindRT_;
|
||||||
|
std::vector<uint32_t> visited_root_graph_ids_;
|
||||||
|
|
||||||
// map to store iter num in each epoch when dataset_sink_mode is true
|
// map to store iter num in each epoch when dataset_sink_mode is true
|
||||||
std::map<uint32_t, int32_t> graph_iter_num_map_;
|
std::map<uint32_t, int32_t> graph_iter_num_map_;
|
||||||
|
|
|
@ -159,12 +159,18 @@ bool CheckReadData(const CNodePtr &cnode) {
|
||||||
return read_data;
|
return read_data;
|
||||||
}
|
}
|
||||||
|
|
||||||
|
bool IsDeviceTargetGPU() {
|
||||||
|
auto context = MsContext::GetInstance();
|
||||||
|
MS_EXCEPTION_IF_NULL(context);
|
||||||
|
return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
|
||||||
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
* Feature group: Dump, Online debugger.
|
* Feature group: Dump, Online debugger.
|
||||||
* Target device group: GPU.
|
* Target device group: Ascend, GPU.
|
||||||
* Runtime category: MindRT.
|
* Runtime category: MindRT.
|
||||||
* Description: Load inputs and outputs of the given node if needed and dump them if dump is enabled, then it performs
|
* Description: Load inputs and outputs of the given node if needed and dump them if dump is enabled, then it performs
|
||||||
* PostExecuteNode function on the given node.
|
* PostExecuteNode function on the given node for GPU.
|
||||||
*/
|
*/
|
||||||
void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
|
void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
|
||||||
const DeviceContext *device_context) {
|
const DeviceContext *device_context) {
|
||||||
|
@ -194,9 +200,11 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
|
||||||
debugger->ClearCurrentData();
|
debugger->ClearCurrentData();
|
||||||
}
|
}
|
||||||
}
|
}
|
||||||
// check if the node is last kernel
|
if (IsDeviceTargetGPU()) {
|
||||||
bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
|
// check if the node is last kernel
|
||||||
debugger->PostExecuteNode(cnode, last_kernel);
|
bool last_kernel = !AnfAlgo::IsInplaceNode(cnode, "skip");
|
||||||
|
debugger->PostExecuteNode(cnode, last_kernel);
|
||||||
|
}
|
||||||
}
|
}
|
||||||
|
|
||||||
/*
|
/*
|
||||||
|
@ -210,10 +218,7 @@ std::string CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr) {
|
||||||
std::string error_info = "";
|
std::string error_info = "";
|
||||||
bool sink_mode = ConfigManager::GetInstance().dataset_mode() || graph_ptr->IsDatasetGraph();
|
bool sink_mode = ConfigManager::GetInstance().dataset_mode() || graph_ptr->IsDatasetGraph();
|
||||||
auto debugger = Debugger::GetInstance();
|
auto debugger = Debugger::GetInstance();
|
||||||
auto context = MsContext::GetInstance();
|
if (debugger->CheckDebuggerDumpEnabled() && sink_mode && IsDeviceTargetGPU()) {
|
||||||
MS_EXCEPTION_IF_NULL(context);
|
|
||||||
bool is_gpu = (context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice);
|
|
||||||
if (debugger->CheckDebuggerDumpEnabled() && sink_mode && is_gpu) {
|
|
||||||
error_info = "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
|
error_info = "e2e_dump is not supported on GPU with dataset_sink_mode=True. Please set dataset_sink_mode=False";
|
||||||
}
|
}
|
||||||
if (debugger->CheckDebuggerEnabled() && sink_mode) {
|
if (debugger->CheckDebuggerEnabled() && sink_mode) {
|
||||||
|
@ -250,17 +255,9 @@ void LoadDataForDebugger(const KernelGraphPtr &graph_ptr) {
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
void DumpSetup(const KernelGraphPtr &graph) {
|
|
||||||
MS_LOG(DEBUG) << "Start!";
|
|
||||||
MS_EXCEPTION_IF_NULL(graph);
|
|
||||||
E2eDump::DumpSetup(graph.get());
|
|
||||||
MS_LOG(DEBUG) << "Finish!";
|
|
||||||
}
|
|
||||||
|
|
||||||
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
|
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
|
||||||
MS_LOG(DEBUG) << "Start!";
|
MS_LOG(DEBUG) << "Start!";
|
||||||
MS_EXCEPTION_IF_NULL(graph);
|
MS_EXCEPTION_IF_NULL(graph);
|
||||||
E2eDump::DumpRunIter(graph, rank_id);
|
|
||||||
E2eDump::DumpData(graph.get(), rank_id);
|
E2eDump::DumpData(graph.get(), rank_id);
|
||||||
MS_LOG(DEBUG) << "Finish!";
|
MS_LOG(DEBUG) << "Finish!";
|
||||||
}
|
}
|
||||||
|
@ -280,7 +277,6 @@ uint32_t GetRankID() {
|
||||||
void SuperKernelE2eDump(const KernelGraphPtr &graph) {
|
void SuperKernelE2eDump(const KernelGraphPtr &graph) {
|
||||||
#ifndef ENABLE_SECURITY
|
#ifndef ENABLE_SECURITY
|
||||||
Dump(graph, GetRankID());
|
Dump(graph, GetRankID());
|
||||||
DumpSetup(graph);
|
|
||||||
#endif
|
#endif
|
||||||
}
|
}
|
||||||
|
|
||||||
|
|
|
@ -294,7 +294,8 @@ void DataPrepareActor::PrepareData(const std::vector<std::vector<TensorPtr>> &in
|
||||||
|
|
||||||
void DataPrepareActor::SendDebugReq(OpContext<DeviceTensor> *const context) {
|
void DataPrepareActor::SendDebugReq(OpContext<DeviceTensor> *const context) {
|
||||||
ActorDispatcher::Send(*debug_aid_, &DebugActor::DebugOnStepBegin, graph_compiler_info_->graphs_,
|
ActorDispatcher::Send(*debug_aid_, &DebugActor::DebugOnStepBegin, graph_compiler_info_->graphs_,
|
||||||
graph_compiler_info_->device_contexts_, context, &GetAID());
|
graph_compiler_info_->origin_parameters_order_, graph_compiler_info_->device_contexts_, context,
|
||||||
|
&GetAID());
|
||||||
}
|
}
|
||||||
|
|
||||||
void DataPrepareActor::OnDebugFinish(OpContext<DeviceTensor> *const context) {
|
void DataPrepareActor::OnDebugFinish(OpContext<DeviceTensor> *const context) {
|
||||||
|
|
|
@ -128,7 +128,9 @@ void DebugActor::DebugForGraph(const KernelGraphPtr &graph, const DeviceContext
|
||||||
* Runtime category: MindRT.
|
* Runtime category: MindRT.
|
||||||
* Description: Checks dataset_sink_mode and generates the related error if any exist and calls PreExecuteGraphDebugger.
|
* Description: Checks dataset_sink_mode and generates the related error if any exist and calls PreExecuteGraphDebugger.
|
||||||
*/
|
*/
|
||||||
void DebugActor::DebugOnStepBegin(std::vector<KernelGraphPtr> graphs, std::vector<DeviceContext *> device_contexts,
|
void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
|
||||||
|
const std::vector<AnfNodePtr> &origin_parameters_order,
|
||||||
|
std::vector<DeviceContext *> device_contexts,
|
||||||
OpContext<DeviceTensor> *const op_context, const AID *from_aid) {
|
OpContext<DeviceTensor> *const op_context, const AID *from_aid) {
|
||||||
MS_EXCEPTION_IF_NULL(op_context);
|
MS_EXCEPTION_IF_NULL(op_context);
|
||||||
MS_EXCEPTION_IF_NULL(from_aid);
|
MS_EXCEPTION_IF_NULL(from_aid);
|
||||||
|
@ -144,7 +146,7 @@ void DebugActor::DebugOnStepBegin(std::vector<KernelGraphPtr> graphs, std::vecto
|
||||||
}
|
}
|
||||||
auto debugger = Debugger::GetInstance();
|
auto debugger = Debugger::GetInstance();
|
||||||
if (debugger != nullptr && debugger->DebuggerBackendEnabled()) {
|
if (debugger != nullptr && debugger->DebuggerBackendEnabled()) {
|
||||||
debugger->PreExecuteGraphDebugger(graphs);
|
debugger->PreExecuteGraphDebugger(graphs, origin_parameters_order);
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -43,8 +43,10 @@ class DebugActor : public ActorBase {
|
||||||
OpContext<DeviceTensor> *const op_context, const AID *from_aid);
|
OpContext<DeviceTensor> *const op_context, const AID *from_aid);
|
||||||
|
|
||||||
// The debug on step begin.
|
// The debug on step begin.
|
||||||
void DebugOnStepBegin(std::vector<KernelGraphPtr> graphs, std::vector<DeviceContext *> device_contexts,
|
void DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
|
||||||
OpContext<DeviceTensor> *const op_context, const AID *from_aid);
|
const std::vector<AnfNodePtr> &origin_parameters_order,
|
||||||
|
std::vector<DeviceContext *> device_contexts, OpContext<DeviceTensor> *const op_context,
|
||||||
|
const AID *from_aid);
|
||||||
|
|
||||||
// The debug on step end.
|
// The debug on step end.
|
||||||
void DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const AID *from_aid);
|
void DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const AID *from_aid);
|
||||||
|
|
|
@ -278,6 +278,16 @@ bool AscendDeviceContext::IsGraphMode() {
|
||||||
}
|
}
|
||||||
|
|
||||||
void AscendDeviceContext::Destroy() {
|
void AscendDeviceContext::Destroy() {
|
||||||
|
#ifdef ENABLE_DEBUGGER
|
||||||
|
auto debugger = Debugger::GetInstance();
|
||||||
|
if (debugger && debugger->debugger_enabled()) {
|
||||||
|
debugger->SetTrainingDone(true);
|
||||||
|
bool ret = debugger->SendMetadata(false);
|
||||||
|
if (!ret) {
|
||||||
|
MS_LOG(ERROR) << "Failed to SendMetadata when finalize";
|
||||||
|
}
|
||||||
|
}
|
||||||
|
#endif
|
||||||
MS_LOG(INFO) << "Status record: Enter Destroy...";
|
MS_LOG(INFO) << "Status record: Enter Destroy...";
|
||||||
if (!initialized_) {
|
if (!initialized_) {
|
||||||
return;
|
return;
|
||||||
|
|
|
@ -153,7 +153,10 @@ void GPUDeviceContext::Destroy() {
|
||||||
auto debugger = Debugger::GetInstance();
|
auto debugger = Debugger::GetInstance();
|
||||||
if (debugger && debugger->debugger_enabled()) {
|
if (debugger && debugger->debugger_enabled()) {
|
||||||
debugger->SetTrainingDone(true);
|
debugger->SetTrainingDone(true);
|
||||||
debugger->SendMetadata(false);
|
bool ret = debugger->SendMetadata(false);
|
||||||
|
if (!ret) {
|
||||||
|
MS_LOG(ERROR) << "Failed to SendMetadata when finalize";
|
||||||
|
}
|
||||||
}
|
}
|
||||||
#endif
|
#endif
|
||||||
|
|
||||||
|
|
|
@ -101,7 +101,7 @@ def run_e2e_dump():
|
||||||
add = Net()
|
add = Net()
|
||||||
add(Tensor(x), Tensor(y))
|
add(Tensor(x), Tensor(y))
|
||||||
if context.get_context("device_target") == "Ascend":
|
if context.get_context("device_target") == "Ascend":
|
||||||
assert len(os.listdir(dump_file_path)) == 5
|
assert len(os.listdir(dump_file_path)) == 3
|
||||||
output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
|
output_name = "Add.Add-op*.0.0.*.output.0.DefaultFormat.npy"
|
||||||
elif context.get_context("device_target") == "CPU":
|
elif context.get_context("device_target") == "CPU":
|
||||||
assert len(os.listdir(dump_file_path)) == 5
|
assert len(os.listdir(dump_file_path)) == 5
|
||||||
|
@ -271,7 +271,7 @@ def test_dump_with_diagnostic_path():
|
||||||
shutil.rmtree(diagnose_path)
|
shutil.rmtree(diagnose_path)
|
||||||
add = Net()
|
add = Net()
|
||||||
add(Tensor(x), Tensor(y))
|
add(Tensor(x), Tensor(y))
|
||||||
assert len(os.listdir(dump_file_path)) == 5
|
assert len(os.listdir(dump_file_path)) == 3
|
||||||
del os.environ['MINDSPORE_DUMP_CONFIG']
|
del os.environ['MINDSPORE_DUMP_CONFIG']
|
||||||
del os.environ['MS_DIAGNOSTIC_DATA_PATH']
|
del os.environ['MS_DIAGNOSTIC_DATA_PATH']
|
||||||
|
|
||||||
|
|
Loading…
Reference in New Issue