!22333 Add tensor base and stat to online debugger

Merge pull request !22333 from parastooashtari/tensor_level_info_online
This commit is contained in:
i-robot 2021-08-26 13:18:32 +00:00 committed by Gitee
commit 158536b9e2
7 changed files with 251 additions and 9 deletions

View File

@ -193,18 +193,26 @@ void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end,
auto wp = std::get<1>(w_table_item);
// check ONLY init conditions on initial suspended state.
// skip other conditions on initial suspended state
if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
if (init_dbg_suspend && (wp.condition.type != INIT)) {
continue;
}
// skip init condition if not init suspend
if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
if ((wp.condition.type == INIT) && !init_dbg_suspend) {
continue;
}
// check change conditions only on step end.
if (wp.change_condition() && !step_end) continue;
if (wp.change_condition() && !step_end) {
continue;
}
// if recheck, ignore the cache results and reanalyze everything.
// if not a recheck, check only unanalyzed tensors
if (!recheck) {
wp_lock_.lock();
bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
wp_lock_.unlock();
if (wp_cache_hit) continue;
if (wp_cache_hit) {
continue;
}
}
std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
if (!found.empty()) {
@ -258,7 +266,9 @@ void DebugServices::CheckWatchpointsForTensor(
const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
const auto tensor_slot = std::to_string(tensor->GetSlot());
// no elements to analyze
if (tensor->GetByteSize() == 0) continue;
if (tensor->GetByteSize() == 0) {
continue;
}
(*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
int tensor_dtype = tensor->GetType();
std::vector<watchpoint_t> watchpoints_to_check;
@ -269,7 +279,9 @@ void DebugServices::CheckWatchpointsForTensor(
AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
&previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
// no wp set on current tensor
if (watchpoints_to_check.empty()) continue;
if (watchpoints_to_check.empty()) {
continue;
}
uint32_t num_elements = tensor->GetNumElements();
#ifdef OFFLINE_DBG_MODE
void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);
@ -1032,6 +1044,15 @@ void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::
}
}
// Resolves every tensor listed in `name` through the tensor loader and
// appends (tensor_name, tensor_data) tuples to `result_list`.
// Does nothing (besides a debug log) when `result_list` is null.
void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
                                       std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
  if (result_list != nullptr) {
    tensor_loader_->SearchTensors(name, result_list);
  } else {
    MS_LOG(DEBUG) << "result_list is nullptr.";
  }
}
#ifdef ONLINE_DBG_MODE
bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
bool ret = false;

View File

@ -186,6 +186,15 @@ class DebugServices {
}
};
// Lightweight tensor descriptor carrying no values: payload size, a numeric
// dtype id, and the shape. Used for "base level" tensor info queries.
struct TensorBase {
// Full-field constructor.
TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
: data_size(data_size), dtype(dtype), shape(shape) {}
TensorBase() = default;
uint64_t data_size = 0;  // tensor payload size in bytes
int dtype = 0;  // numeric type id -- presumably matches TensorData::GetType(); confirm against callers
std::vector<int64_t> shape;  // dimension sizes
};
struct TensorStat {
TensorStat(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value,
double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count,
@ -313,6 +322,9 @@ class DebugServices {
void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *const shape);
void SearchNodesTensors(const std::vector<std::string> &name,
std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list);
#ifdef ONLINE_DBG_MODE
bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;

View File

@ -1,5 +1,5 @@
/**
* Copyright 2019 Huawei Technologies Co., Ltd
* Copyright 2019-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -25,6 +25,8 @@ service EventListener {
rpc SendMetadata (Metadata) returns (EventReply) {};
rpc SendGraph (stream Chunk) returns (EventReply) {};
rpc SendTensors (stream TensorProto) returns (EventReply) {};
rpc SendTensorBase (TensorBase) returns (EventReply) {};
rpc SendTensorStats (TensorSummary) returns (EventReply) {};
rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {};
rpc SendMultiGraphs (stream Chunk) returns (EventReply) {};
rpc SendHeartbeat (Heartbeat) returns (EventReply) {};
@ -87,6 +89,12 @@ message SetCMD {
// View command: client asks for information about the listed tensors.
message ViewCMD {
repeated TensorProto tensors = 1;
// Granularity of the tensor information requested.
enum Level {
value = 0;  // full tensor values (streamed as TensorProto)
statistics = 1;  // summary statistics (replied as TensorSummary)
base = 2;  // base info only: dtype, shape, byte size (replied as TensorBase)
}
Level level = 2;
}
message WatchCondition {
@ -142,3 +150,28 @@ message Heartbeat {
string message = 1;
int32 period = 2;
}
// Reply for a statistics-level view request: base tensor info plus the
// computed statistics for one tensor.
message TensorSummary{
TensorBase tensor_base = 1;
Statistics statistics = 2;
}
// Per-tensor element statistics. Field names mirror
// DebugServices::TensorStat on the C++ side.
message Statistics {
bool is_bool = 1;
float max_value = 2;
float min_value = 3;
float avg_value = 4;
int32 count = 5;  // total element count
int32 neg_zero_count = 6;
int32 pos_zero_count = 7;
int32 nan_count = 8;
int32 neg_inf_count = 9;
int32 pos_inf_count = 10;
int32 zero_count = 11;
}
// Base-level tensor info: dtype id, shape, and payload size in bytes.
// Sent alone for base-level view requests, or embedded in TensorSummary.
message TensorBase{
int32 data_type = 1;
repeated int64 shape = 2;
int64 data_size = 3;
}

View File

@ -48,6 +48,7 @@ using debugger::Chunk;
using debugger::EventReply;
using debugger::GraphProto;
using debugger::ModelProto;
using debugger::Statistics;
using debugger::TensorProto;
using debugger::WatchCondition;
using debugger::WatchCondition_Condition_inf;
@ -839,6 +840,29 @@ void Debugger::ProcessKViewCMD(const EventReply &reply) {
MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
}
switch (reply.view_cmd().level()) {
case debugger::ViewCMD_Level::ViewCMD_Level_base:
MS_LOG(INFO) << "Tensor base request.";
ViewBaseLevel(reply);
break;
case debugger::ViewCMD_Level::ViewCMD_Level_statistics:
MS_LOG(INFO) << "Tensor statistics request.";
ViewStatLevel(reply);
break;
case debugger::ViewCMD_Level::ViewCMD_Level_value:
MS_LOG(INFO) << "Tensor value request.";
ViewValueLevel(reply);
break;
default:
MS_LOG(DEBUG) << "Debug: Unknown tensor info level";
break;
}
}
void Debugger::ViewValueLevel(const EventReply &reply) {
MS_LOG(INFO) << "Sending tensors";
std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
// print view cmd reply
@ -860,6 +884,30 @@ void Debugger::ProcessKViewCMD(const EventReply &reply) {
}
}
// Handles a statistics-level view command: computes a TensorSummary for every
// requested tensor and sends each one to the debugger client over gRPC.
// A failed send is logged (with the tensor's request index) but does not
// abort the remaining sends.
void Debugger::ViewStatLevel(const EventReply &reply) {
  std::list<TensorSummary> tensor_stat_list = LoadTensorsStat(GetTensors(reply));
  int index = 0;
  // const ref: avoids copying each protobuf TensorSummary per iteration.
  for (const auto &tensor_stat : tensor_stat_list) {
    EventReply send_tensors_stat_reply = grpc_client_->SendTensorStats(tensor_stat);
    if (send_tensors_stat_reply.status() != debugger::EventReply::OK) {
      MS_LOG(ERROR) << "Error: SendTensorsStats failed for tensor index " << index << ".";
    }
    index++;
  }
}
// Handles a base-level view command: builds a TensorBase (dtype, shape, byte
// size) for every requested tensor and sends each one to the debugger client
// over gRPC. A failed send is logged (with the tensor's request index) but
// does not abort the remaining sends.
void Debugger::ViewBaseLevel(const EventReply &reply) {
  std::list<TensorBase> tensors_base_list = LoadTensorsBase(GetTensors(reply));
  int index = 0;
  // const ref: avoids copying each protobuf TensorBase per iteration.
  for (const auto &tensor_base : tensors_base_list) {
    EventReply send_tensors_base_reply = grpc_client_->SendTensorBase(tensor_base);
    if (send_tensors_base_reply.status() != debugger::EventReply::OK) {
      MS_LOG(ERROR) << "Error: SendTensorsBase failed for tensor index " << index << ".";
    }
    index++;
  }
}
void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
tensor_item->set_node_name(tensor.node_name());
tensor_item->set_slot(tensor.slot());
@ -870,6 +918,35 @@ void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
tensor_item->clear_dims();
}
// Builds a TensorSummary protobuf from a DebugServices::TensorStat and
// appends it to tensor_summary_list. No-op (with a debug log) when the
// output list is null.
void AddTensorStatInfo(const DebugServices::TensorStat &tensor_stat, std::list<TensorSummary> *tensor_summary_list) {
  if (tensor_summary_list == nullptr) {
    MS_LOG(DEBUG) << "tensor_summary_list is nullptr.";
    return;
  }
  TensorSummary summary;
  // Base-level info: dtype id, payload size and shape.
  TensorBase *base = summary.mutable_tensor_base();
  base->set_data_type(tensor_stat.dtype);
  base->set_data_size(tensor_stat.data_size);
  for (const auto dim : tensor_stat.shape) {
    base->add_shape(dim);
  }
  // Element statistics, copied field-for-field from the TensorStat.
  Statistics *stats = summary.mutable_statistics();
  stats->set_is_bool(tensor_stat.is_bool);
  stats->set_max_value(tensor_stat.max_value);
  stats->set_min_value(tensor_stat.min_value);
  stats->set_avg_value(tensor_stat.avg_value);
  stats->set_count(tensor_stat.count);
  stats->set_neg_zero_count(tensor_stat.neg_zero_count);
  stats->set_pos_zero_count(tensor_stat.pos_zero_count);
  stats->set_nan_count(tensor_stat.nan_count);
  stats->set_neg_inf_count(tensor_stat.neg_inf_count);
  stats->set_pos_inf_count(tensor_stat.pos_inf_count);
  stats->set_zero_count(tensor_stat.zero_count);
  tensor_summary_list->push_back(summary);
}
void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
const ProtoVector<WatchCondition_Parameter> &parameters) {
std::vector<std::tuple<std::string, bool>> check_node_list;
@ -945,6 +1022,56 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
return tensor_list;
}
// Builds a TensorBase protobuf (dtype, shape, byte size) for every tensor
// named in `tensors`, in request order. A tensor that cannot be found yields
// a placeholder entry (size 0, dtype 0, shape [0]) so the reply stays
// aligned with the request list.
std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const {
  std::list<TensorBase> tensor_base_list;
  std::vector<std::string> name;
  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  debug_services_->SearchNodesTensors(name, &result_list);
  // const ref: each tuple holds a std::string and a shared_ptr; copying it
  // would allocate and bump the atomic refcount on every iteration.
  for (const auto &result : result_list) {
    const auto &tensor = std::get<1>(result);
    TensorBase tensor_base_item;
    if (tensor == nullptr) {
      // tensor was not found, creating empty tensor base.
      tensor_base_item.set_data_size(0);
      tensor_base_item.set_data_type(0);
      tensor_base_item.add_shape(0);
    } else {
      // tensor was found, creating tensor base object.
      tensor_base_item.set_data_size(tensor->GetByteSize());
      tensor_base_item.set_data_type(tensor->GetType());
      for (auto elem : tensor->GetShape()) {
        tensor_base_item.add_shape(elem);
      }
    }
    tensor_base_list.push_back(tensor_base_item);
  }
  return tensor_base_list;
}
// Builds a TensorSummary (base info + statistics) for every tensor named in
// `tensors`, in request order. A tensor that cannot be found yields a
// summary built from a default TensorStat so the reply stays aligned with
// the request list.
std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const {
  std::list<TensorSummary> tensor_summary_list;
  std::vector<std::string> name;
  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
  debug_services_->SearchNodesTensors(name, &result_list);
  // const ref: each tuple holds a std::string and a shared_ptr; copying it
  // would allocate and bump the atomic refcount on every iteration.
  for (const auto &result : result_list) {
    const auto &tensor = std::get<1>(result);
    if (tensor == nullptr) {
      // tensor was not found, creating empty tensor summary.
      DebugServices::TensorStat tensor_stat;
      AddTensorStatInfo(tensor_stat, &tensor_summary_list);
      continue;
    }
    // tensor was found, creating tensor summary object.
    DebugServices::TensorStat tensor_stat = debug_services_->GetTensorStatistics(tensor);
    AddTensorStatInfo(tensor_stat, &tensor_summary_list);
  }
  return tensor_summary_list;
}
void Debugger::Exit() {
// clear resource before exit
// debugger will notify main thread to exit because main thread can only exit at step boundary.

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -32,6 +32,7 @@ using debugger::DataType;
using debugger::EventReply;
using debugger::GraphProto;
using debugger::ModelProto;
using debugger::Statistics;
using debugger::TensorProto;
using debugger::WatchCondition;
using debugger::WatchCondition_Parameter;
@ -216,6 +217,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void ProcessKSetCMD(const EventReply &reply);
// Process the KViewCMD
void ProcessKViewCMD(const EventReply &reply);
// ViewCMD base level
void ViewBaseLevel(const EventReply &reply);
// ViewCMD statistics level
void ViewStatLevel(const EventReply &reply);
// ViewCMD value level
void ViewValueLevel(const EventReply &reply);
// set what nodes and conditions to watch
void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
const ProtoVector<WatchCondition_Parameter> &parameters);
@ -226,6 +233,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
// load tensor for view command
std::list<TensorProto> LoadTensors(const ProtoVector<TensorProto> &tensors) const;
// load tensor base for view command
std::list<TensorBase> LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const;
// load tensor statistics for view command
std::list<TensorSummary> LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const;
// terminate training process
void Exit();

View File

@ -26,7 +26,9 @@ using debugger::EventReply_Status_FAILED;
using debugger::GraphProto;
using debugger::Heartbeat;
using debugger::Metadata;
using debugger::TensorBase;
using debugger::TensorProto;
using debugger::TensorSummary;
using debugger::WatchpointHit;
namespace mindspore {
@ -200,4 +202,32 @@ EventReply GrpcClient::SendHeartbeat(const Heartbeat &heartbeat) {
}
return reply;
}
// Sends a single TensorBase message to the debugger backend over gRPC.
// On RPC failure, logs the gRPC error and marks the returned reply FAILED.
EventReply GrpcClient::SendTensorBase(const TensorBase &tensor_base) {
  grpc::ClientContext context;
  EventReply response;
  const grpc::Status rpc_status = stub_->SendTensorBase(&context, tensor_base, &response);
  if (!rpc_status.ok()) {
    MS_LOG(ERROR) << "RPC failed: SendTensorBase";
    MS_LOG(ERROR) << rpc_status.error_code() << ": " << rpc_status.error_message();
    response.set_status(EventReply_Status_FAILED);
  }
  return response;
}
// Sends a single TensorSummary (base info + statistics) to the debugger
// backend over gRPC. On RPC failure, logs the gRPC error and marks the
// returned reply FAILED.
EventReply GrpcClient::SendTensorStats(const TensorSummary &tensor_summary) {
  grpc::ClientContext context;
  EventReply response;
  const grpc::Status rpc_status = stub_->SendTensorStats(&context, tensor_summary, &response);
  if (!rpc_status.ok()) {
    MS_LOG(ERROR) << "RPC failed: SendTensorStats";
    MS_LOG(ERROR) << rpc_status.error_code() << ": " << rpc_status.error_message();
    response.set_status(EventReply_Status_FAILED);
  }
  return response;
}
} // namespace mindspore

View File

@ -1,5 +1,5 @@
/**
* Copyright 2020 Huawei Technologies Co., Ltd
* Copyright 2020-2021 Huawei Technologies Co., Ltd
*
* Licensed under the Apache License, Version 2.0 (the "License");
* you may not use this file except in compliance with the License.
@ -29,7 +29,9 @@ using debugger::EventReply;
using debugger::GraphProto;
using debugger::Heartbeat;
using debugger::Metadata;
using debugger::TensorBase;
using debugger::TensorProto;
using debugger::TensorSummary;
using debugger::WatchpointHit;
namespace mindspore {
@ -55,6 +57,10 @@ class GrpcClient {
EventReply SendTensors(const std::list<TensorProto> &tensors);
EventReply SendTensorBase(const TensorBase &tensor_base);
EventReply SendTensorStats(const TensorSummary &tensor_summary);
EventReply SendMultiGraphs(const std::list<Chunk> &chunks);
EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints);