!23732 Fix code self-check issues for online dbg in master

Merge pull request !23732 from TinaMengtingZhang/code_self_check_master
i-robot 2021-09-18 03:37:58 +00:00 committed by Gitee
commit 38bab297ec
12 changed files with 44 additions and 100 deletions

View File

@ -1863,6 +1863,7 @@ void AscendSession::LoadGraphsToDbg(NotNull<KernelGraphPtr> graph,
MS_LOG(INFO) << "Start to do LoadGraphsToDbg in graph: " << graph->graph_id();
MS_EXCEPTION_IF_NULL(debugger_);
debugger_->LoadGraphs(graph);
MS_LOG(INFO) << "graph_sum_: " << graph_sum_;
for (auto &child_graph : graph->child_graph_order()) {

View File

@ -161,6 +161,7 @@ class SessionBasic : public std::enable_shared_from_this<SessionBasic> {
debugger_ = Debugger::GetInstance();
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
MS_EXCEPTION_IF_NULL(debugger_);
debugger_->Init(device_id_, ms_context->get_param<std::string>(MS_CTX_DEVICE_TARGET));
}
#endif
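The added MS_EXCEPTION_IF_NULL(debugger_) line above is the recurring fix in this PR: validate a pointer before its first dereference instead of assuming GetInstance() succeeded. As a rough sketch of what such a guard boils down to (EXCEPTION_IF_NULL_SKETCH and FakeDebugger are illustrative stand-ins, not MindSpore code; the real macro reports through MindSpore's logging and exception machinery rather than throwing directly):

#include <memory>
#include <stdexcept>
#include <string>

// Stand-in for an MS_EXCEPTION_IF_NULL-style guard: fail loudly at the call
// site if the pointer is null, instead of crashing on a later dereference.
#define EXCEPTION_IF_NULL_SKETCH(ptr)                                  \
  do {                                                                 \
    if ((ptr) == nullptr) {                                            \
      throw std::runtime_error(std::string("The pointer [") + #ptr +   \
                               "] is null.");                          \
    }                                                                  \
  } while (0)

struct FakeDebugger {  // illustrative stand-in for the Debugger singleton
  void Init() {}
};

void InitDebugger(const std::shared_ptr<FakeDebugger> &debugger) {
  EXCEPTION_IF_NULL_SKETCH(debugger);  // guard before the first dereference
  debugger->Init();
}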

View File

@ -181,8 +181,8 @@ std::optional<std::string> Common::GetConfigFile(const std::string &env) {
if (env.empty()) {
MS_LOG(EXCEPTION) << "Invalid env";
}
auto config_path_str = std::getenv(env.c_str());
if (config_path_str == nullptr) {
auto config_path_str = common::GetEnv(env);
if (config_path_str.empty()) {
MS_LOG(ERROR) << "Please export env:" << env;
return std::nullopt;
}
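The change above replaces a raw std::getenv call, which can return nullptr, with common::GetEnv, which hands back a std::string so the caller only has to test .empty(). A minimal sketch of such a wrapper, for illustration only (GetEnvSketch is a made-up name, not MindSpore's actual implementation):

#include <cstdlib>
#include <string>

// Hypothetical GetEnv-style helper: hide the possibly-null char* returned by
// std::getenv behind a std::string so callers never compare against nullptr.
std::string GetEnvSketch(const std::string &name) {
  const char *value = std::getenv(name.c_str());
  return value == nullptr ? std::string() : std::string(value);
}

// Usage then mirrors the hunk above:
//   auto config_path_str = GetEnvSketch(env);
//   if (config_path_str.empty()) { /* report the missing env var */ }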

View File

@ -130,7 +130,7 @@ void DumpJsonParser::CopyJsonToDir(uint32_t rank_id) {
}
auto dump_config_file = Common::GetConfigFile(kMindsporeDumpConfig);
if (!dump_config_file.has_value()) {
MS_LOG(EXCEPTION) << "Get dump config file failed";
MS_LOG(EXCEPTION) << "Get dump config file failed.";
}
std::ifstream json_file(dump_config_file.value());
if (async_dump_enabled_ || e2e_dump_enabled_) {

View File

@ -52,7 +52,6 @@ class DumpJsonParser {
std::string path() const { return path_; }
std::string iteration_string() const { return iteration_; }
std::string net_name() const { return net_name_; }
uint32_t input_output() const { return input_output_; }
uint32_t op_debug_mode() const { return op_debug_mode_; }
bool trans_flag() const { return trans_flag_; }
uint32_t cur_dump_iter() const { return cur_dump_iter_; }

View File

@ -411,14 +411,18 @@ void DebugServices::CheckWatchpoints(std::vector<std::string> *const name, std::
std::vector<unsigned int> *root_graph_id) {
std::lock_guard<std::mutex> lg(lock_);
auto t1 = std::chrono::high_resolution_clock::now();
if (watchpoint_table_.empty()) return;
if (watchpoint_table_.empty()) {
return;
}
// vector to store execution order of tensors hit
std::vector<int> exec_order;
std::vector<std::string> time_stamps;
int tensor_list_size = tensor_list->size();
uint64_t tensor_list_byte_size = 0;
MS_LOG(INFO) << "tensor list size: " << tensor_list_size;
if (tensor_list_size == 0) return;
if (tensor_list_size == 0) {
return;
}
// default value for number of threads
const int default_thread_num = 16;
int max_thread_num = default_thread_num;
@ -1166,7 +1170,7 @@ void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::
tensor_loader_->SearchTensors(name, &result_list);
for (auto result : result_list) {
if (!std::get<1>(result)) {
if (std::get<1>(result) == nullptr) {
continue;
}
ret_name->push_back(std::get<0>(result));
@ -1206,7 +1210,7 @@ bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr
}
bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const {
if (kernel && w_name.length() > 0) {
if (kernel != nullptr && w_name.length() > 0) {
auto input_size = AnfAlgo::GetInputTensorNum(kernel);
for (size_t j = 0; j < input_size; ++j) {
auto input_kernel = kernel->input(j + 1);
@ -1222,14 +1226,8 @@ bool DebugServices::IsWatchPointNodeInput(const std::string &w_name, const CNode
}
#endif
void DebugServices::EmptyTensor() { tensor_loader_->EmptyTensor(); }
std::vector<std::shared_ptr<TensorData>> DebugServices::GetTensor() const { return tensor_loader_->GetTensor(); }
uint32_t DebugServices::GetTensorLoaderIterNum() const { return tensor_loader_->GetIterNum(); }
void DebugServices::SetTensorLoaderIterNum(uint32_t iter_num) { tensor_loader_->set_iter_num(iter_num); }
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
#ifdef ONLINE_DBG_MODE
@ -1246,10 +1244,6 @@ bool DebugServices::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, boo
return tensor_loader_->LoadNewTensor(tensor, keep_prev);
}
std::unordered_map<unsigned int, DebugServices::watchpoint_t> DebugServices::GetWatchpointTable() {
return watchpoint_table_;
}
void DebugServices::ResetLoadedTensors() {
wp_id_cache_.clear();
MS_LOG(INFO) << "Resetting loaded tensors";
@ -1269,7 +1263,9 @@ std::vector<std::shared_ptr<TensorData>> DebugServices::GetNodeTensor(const CNod
for (size_t j = 0; j < output_size; ++j) {
auto tensor_name_with_slot = kernel_name + ":" + std::to_string(j);
auto tensor = tensor_loader_->GetTensor(tensor_name_with_slot);
if (tensor) result.push_back(tensor);
if (tensor != nullptr) {
result.push_back(tensor);
}
}
return result;
}
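Most edits in this file are mechanical style fixes reported by the code self-check: single-statement if bodies gain braces, and pointer truthiness tests such as if (tensor) become explicit nullptr comparisons. A compact illustration of the accepted form, using a stand-in TensorData type rather than the real debugger class:

#include <memory>
#include <vector>

struct TensorData {};  // stand-in only; the real TensorData lives in the debugger

// Accepted style: explicit comparison against nullptr and a braced body,
// instead of `if (tensor) result->push_back(tensor);`.
void CollectTensor(const std::shared_ptr<TensorData> &tensor,
                   std::vector<std::shared_ptr<TensorData>> *result) {
  if (tensor != nullptr) {
    result->push_back(tensor);
  }
}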

View File

@ -90,7 +90,9 @@ class DebugServices {
bool hit;
double_t actual_value;
void Evaluate(double_t actualValue, std::string inequality_type) {
if (std::isnan(actualValue)) return;
if (std::isnan(actualValue)) {
return;
}
actual_value = actualValue;
// if cannot extract inequality type from watchpoint
@ -164,17 +166,6 @@ class DebugServices {
condition.type == SD_LT || condition.type == MAX_MIN_LT;
}
bool min_max_enabled() const {
return condition.type == MAX_LT || condition.type == MAX_GT || condition.type == MIN_LT ||
condition.type == MIN_GT || condition.type == MAX_MIN_LT || condition.type == MAX_MIN_GT ||
(condition.type == INIT && (!parameter_list[1].disabled || !parameter_list[2].disabled)) ||
(condition.type == TOO_LARGE && (!parameter_list[1].disabled || !parameter_list[2].disabled)) ||
(condition.type == TOO_SMALL && (!parameter_list[1].disabled || !parameter_list[2].disabled));
}
// inf or nan related condition set
bool inf_nan_enabled() const {
return condition.type == HAS_INF || condition.type == HAS_NAN || condition.type == GENERAL_OVERFLOW;
}
// mean or sd related condition set
bool mean_sd_enabled() const {
return condition.type == MEAN_LT || condition.type == MEAN_GT || condition.type == SD_LT ||
@ -185,7 +176,6 @@ class DebugServices {
return (condition.type == TOO_LARGE && !parameter_list[0].disabled) ||
(condition.type == TOO_SMALL && !parameter_list[0].disabled);
}
bool zero_percentage_enabled() const { return condition.type == ALL_ZERO || condition.type == INIT; }
bool tensor_update_ratio_mean_enabled() const {
return condition.type == CHANGE_TOO_LARGE || condition.type == CHANGE_TOO_SMALL;
@ -372,16 +362,11 @@ class DebugServices {
bool IsWatchPointNodeInput(const std::string &w_name, const CNodePtr &kernel) const;
#endif
void EmptyTensor();
std::vector<std::shared_ptr<TensorData>> GetTensor() const;
void AddAnalyzedTensorToCache(const bool recheck, const unsigned int id, const std::string &tensor_name);
uint32_t GetTensorLoaderIterNum() const;
void SetTensorLoaderIterNum(uint32_t iter_num);
void EmptyCurrentTensor();
#ifdef ONLINE_DBG_MODE
@ -392,8 +377,6 @@ class DebugServices {
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
std::unordered_map<unsigned int, watchpoint_t> GetWatchpointTable();
void ResetLoadedTensors();
#ifdef ONLINE_DBG_MODE
std::vector<std::shared_ptr<TensorData>> GetNodeTensor(const CNodePtr &kernel);

View File

@ -291,6 +291,7 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
}
}
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
MS_EXCEPTION_IF_NULL(graph_ptr);
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
CheckDatasetSinkMode();
@ -379,7 +380,7 @@ uint32_t Debugger::GetRankID() {
}
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
uint32_t rank_id = GetRankID();
if (debugger_->DebuggerBackendEnabled()) {
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
MS_EXCEPTION_IF_NULL(kernel_graph);
(void)E2eDump::DumpParametersAndConstData(kernel_graph.get(), rank_id, debugger_.get());
} else {
@ -388,7 +389,7 @@ void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
}
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) {
if (debugger_->DebuggerBackendEnabled()) {
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
uint32_t rank_id = GetRankID();
(void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get());
}
@ -429,9 +430,11 @@ void Debugger::PostExecuteGraphDebugger() {
return;
}
// LoadParametersAndConst for all the graphs
if (debugger_) {
for (auto graph : graph_ptr_list_) {
debugger_->LoadParametersAndConst(graph);
}
}
// debug used for dump
if (debugger_ && debugger_->CheckDebuggerDumpEnabled()) {
// Dump Parameters and consts
@ -453,7 +456,7 @@ void Debugger::PostExecute() {
if (pipeline::GraphExecutorPy::GetDebugTerminate()) {
return;
}
if (debugger_->DebuggerBackendEnabled()) {
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
// analyze tensor data and send the watchpoints that have been hit
if (debugger_enabled_ && !is_dataset_graph_) {
if (device_target_ != kGPUDevice) {
@ -516,17 +519,8 @@ void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
}
}
void Debugger::PostDebugOp() {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
// suspend if debugger is enabled
if (debugger_enabled_ && !is_dataset_graph_) {
MS_LOG(INFO) << "Debugger suspend at debug_op";
CommandLoop();
}
}
void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
MS_EXCEPTION_IF_NULL(graph_ptr);
if (graph_ptr_ != graph_ptr) {
MS_LOG(INFO) << "LoadGraphs Debugger got new graph: " << graph_ptr->graph_id();
// save new graph_ptr
@ -547,6 +541,7 @@ void Debugger::LoadGraphs(const KernelGraphPtr &graph_ptr) {
// In single graph cases, check single graph ptr
void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
MS_EXCEPTION_IF_NULL(graph_ptr);
if (graph_ptr_ != graph_ptr) {
MS_LOG(INFO) << "CheckGraphPtr Debugger got new graph: " << graph_ptr->graph_id();
// save new graph_ptr
@ -566,6 +561,7 @@ void Debugger::CheckGraphPtr(const KernelGraphPtr &graph_ptr) {
void Debugger::CheckDatasetGraph() {
// print parameter node names
MS_EXCEPTION_IF_NULL(graph_ptr_);
const auto &params = graph_ptr_->inputs();
for (const auto &param : params) {
MS_LOG(INFO) << "param: " << GetKernelNodeName(param);
@ -602,6 +598,7 @@ void Debugger::SendHeartbeat(int32_t period) {
SetEnableHeartbeat(CheckDebuggerEnabled());
while (enable_heartbeat_) {
MS_EXCEPTION_IF_NULL(grpc_client_);
EventReply reply = grpc_client_->SendHeartbeat(heartbeat);
if (reply.status() != reply.OK) {
@ -624,6 +621,7 @@ void Debugger::SendHeartbeat(int32_t period) {
void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
if (SendMetadata(true)) {
// send graph to Mindinsight server
MS_EXCEPTION_IF_NULL(grpc_client_);
EventReply reply = grpc_client_->SendGraph(graph_proto);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: SendGraph failed";
@ -635,6 +633,7 @@ void Debugger::SendGraphAndSuspend(const GraphProto &graph_proto) {
bool Debugger::SendMetadata(bool version_check) {
// prepare metadata
MS_EXCEPTION_IF_NULL(graph_ptr_);
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
Metadata metadata;
metadata.set_device_name(device_name);
@ -647,6 +646,7 @@ bool Debugger::SendMetadata(bool version_check) {
// set graph number to not_dataset_graph_sum_
metadata.set_graph_num(not_dataset_graph_sum_);
MS_EXCEPTION_IF_NULL(grpc_client_);
EventReply reply_metadata = grpc_client_->SendMetadata(metadata);
bool ret = false;
@ -681,6 +681,7 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot
if (!SendMetadata(true)) {
return;
}
MS_EXCEPTION_IF_NULL(grpc_client_);
// send multiple graphs to MindInsight server
// split graph into chunks if one graph is larger than chunk size
std::list<Chunk> chunked_graph_proto_list;
@ -716,6 +717,7 @@ void Debugger::SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_prot
void Debugger::CommandLoop() {
// prepare metadata
MS_EXCEPTION_IF_NULL(graph_ptr_);
std::string device_name = std::to_string(device_id_) + ":" + std::to_string(graph_ptr_->graph_id());
Metadata metadata;
@ -732,6 +734,7 @@ void Debugger::CommandLoop() {
while (!run) {
// wait for command
MS_EXCEPTION_IF_NULL(grpc_client_);
EventReply reply = grpc_client_->WaitForCommand(metadata);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: WaitForCommand failed";
@ -885,6 +888,7 @@ void Debugger::ViewValueLevel(const EventReply &reply) {
}
MS_LOG(INFO) << "tensor dtype: " << tensor.data_type();
}
MS_EXCEPTION_IF_NULL(grpc_client_);
EventReply send_tensors_reply = grpc_client_->SendTensors(tensors);
if (send_tensors_reply.status() != debugger::EventReply::OK) {
MS_LOG(ERROR) << "Error: SendTensors failed";
@ -1127,6 +1131,7 @@ std::list<WatchpointHit> Debugger::CheckWatchpoints(const std::string &watchnode
void Debugger::SendWatchpoints(const std::list<WatchpointHit> &points) {
// send info about watchpoint
if (!points.empty()) {
MS_EXCEPTION_IF_NULL(grpc_client_);
EventReply reply = grpc_client_->SendWatchpointHits(points);
if (reply.status() != reply.OK) {
MS_LOG(ERROR) << "Error: SendWatchpointHits failed";
@ -1141,16 +1146,6 @@ bool Debugger::DumpTensorToFile(const std::string &tensor_name, bool trans_flag,
device_type, addr_format, slot);
}
bool Debugger::DebugServicesIsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
return debug_services_.get()->IsWatchPoint(kernel_name, kernel);
}
void Debugger::EmptyTensor() { debug_services_.get()->EmptyTensor(); }
void Debugger::SetTensorLoaderIterNum(uint32_t iter_num) { debug_services_.get()->SetTensorLoaderIterNum(iter_num); }
uint32_t Debugger::GetTensorLoaderIterNum() const { return debug_services_.get()->GetTensorLoaderIterNum(); }
bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
return debug_services_.get()->LoadNewTensor(tensor, keep_prev);
}
@ -1273,14 +1268,6 @@ void Debugger::SetCurNode(const std::string &cur_name) {
std::string Debugger::run_level() const { return run_level_; }
void Debugger::SetStepNum(int32_t cur_num_step) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
num_step_ = cur_num_step;
}
int32_t Debugger::step_num() const { return num_step_; }
void Debugger::SetTrainingDone(bool training_done) { training_done_ = training_done; }
bool Debugger::CheckPort(const std::string &port) const {
@ -1377,6 +1364,7 @@ void Debugger::LoadParametersAndConst() {
void Debugger::LoadParametersAndConst(const KernelGraphPtr &graph) {
if (!(debugger_enabled_ || CheckDebuggerDumpEnabled())) return;
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(graph_ptr_);
// load parameters
MS_LOG(INFO) << "Start to load Parameters for graph " << graph->graph_id() << ".";
const auto &parameters = graph_ptr_->inputs();
@ -1432,6 +1420,8 @@ void Debugger::LoadGraphOutputs() {
}
void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
MS_EXCEPTION_IF_NULL(graph);
MS_EXCEPTION_IF_NULL(debugger_);
// update step number if we are processing the first graph (to support multigraph)
if (device_target_ == kGPUDevice && (debugger_enabled_ || device::KernelRuntime::DumpDataEnabledIteration()) &&
(graph->graph_id() == debugger_->GetFirstRunGraphId())) {
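A second pattern in the Dump, DumpSingleNode, PostExecuteGraphDebugger, and PostExecute hunks above is the short-circuit guard on debugger_ itself, e.g. if (debugger_ && debugger_->DebuggerBackendEnabled()), so the dump paths are skipped rather than dereferencing a null shared_ptr. A self-contained sketch of the same idea (DebuggerLike and MaybeDump are made-up names for illustration):

#include <memory>

struct DebuggerLike {  // minimal stand-in exposing the one query we need
  bool backend_enabled = false;
  bool DebuggerBackendEnabled() const { return backend_enabled; }
};

// The && short-circuits: if the pointer is null, the method call never runs,
// so a missing debugger simply disables dumping instead of crashing.
void MaybeDump(const std::shared_ptr<DebuggerLike> &debugger) {
  if (debugger && debugger->DebuggerBackendEnabled()) {
    // ... dump parameters and constant data here ...
  }
}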

View File

@ -102,21 +102,10 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);
// suspend the execution after a debug_op
void PostDebugOp();
bool DumpTensorToFile(const std::string &tensor_name, bool trans_flag, const std::string &filepath,
const std::string &host_fmt, const std::vector<int64_t> &host_shape, TypeId host_type,
TypeId device_type, const std::string &addr_format, size_t slot) const;
bool DebugServicesIsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;
void EmptyTensor();
void SetTensorLoaderIterNum(uint32_t iter_num);
uint32_t GetTensorLoaderIterNum() const;
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);
bool debugger_enabled() const;
@ -129,10 +118,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
std::string run_level() const;
void SetStepNum(int32_t cur_num_step);
int32_t step_num() const;
// check if any feature that uses the debugger backend is enabled
bool DebuggerBackendEnabled() const;
@ -291,8 +276,6 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
using DebuggerPtr = std::shared_ptr<Debugger>;
// get debugger ModelProto
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph);
ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph);
// for getting proto DataType from Type of Tensor

View File

@ -506,11 +506,6 @@ void DebuggerProtoExporter::ExportValueNodes(const std::map<AnfNodePtr, size_t>
void DebuggerProtoExporter::InitModelInfo() { model_.set_ir_version(debugger::IR_VERSION); }
std::string GetDebuggerFuncGraphProtoString(const FuncGraphPtr &func_graph) {
DebuggerProtoExporter exporter;
return exporter.GetFuncGraphProtoString(func_graph);
}
debugger::ModelProto GetDebuggerFuncGraphProto(const FuncGraphPtr &func_graph) {
DebuggerProtoExporter exporter;
return exporter.GetFuncGraphProto(func_graph);

View File

@ -120,10 +120,6 @@ class TensorLoader {
return nullptr;
}
uint32_t GetIterNum() const { return iter_num_; }
std::map<std::string, std::shared_ptr<TensorData>> GetTensorMap() { return tensor_list_map_; }
std::shared_ptr<TensorData> GetPrevTensor(const std::string &tensor_name) {
if (tensor_list_map_.find(tensor_name + ":prev") != tensor_list_map_.end()) {
return tensor_list_map_[tensor_name + ":prev"];
@ -152,8 +148,6 @@ class TensorLoader {
void EmptyCurrentTensor() { tensor_list_map_.clear(); }
void set_iter_num(uint32_t iter_num) { this->iter_num_ = iter_num; }
bool EnableMemoryControl() { return mem_total_ > 0; }
void AppendToCacheEvictQueue(const std::string &tensor_name) {

View File

@ -792,6 +792,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
gpu_kernel->PostExecute();
}
#ifdef ENABLE_DEBUGGER
MS_EXCEPTION_IF_NULL(debugger_);
// called once per kernel to collect the outputs to the kernel (does a SyncDeviceToHost)
LoadKernelData(debugger_.get(), kernel, kernel_inputs, kernel_workspaces, kernel_outputs, exec_order, stream_,
dump_enabled, kernel == last_kernel);
@ -802,6 +803,7 @@ bool GPUKernelRuntime::LaunchKernelDynamic(const session::KernelGraph *graph, bo
if (!UpdateMemorySwapTask(kernel, mock, profiling)) {
#ifdef ENABLE_DEBUGGER
if (!mock) {
MS_EXCEPTION_IF_NULL(debugger_);
// invalidate current data collected by the debugger
debugger_->ClearCurrentData();
}