!22333 Add tensor base and stat to online debugger

Merge pull request !22333 from parastooashtari/tensor_level_info_online

Commit 158536b9e2

debug_services.cc

@@ -193,18 +193,26 @@ void DebugServices::AddWatchPointsToCheck(bool init_dbg_suspend, bool step_end,
     auto wp = std::get<1>(w_table_item);
-    // check ONLY init conditions on initial suspended state.
-    if (init_dbg_suspend && (wp.condition.type != INIT)) continue;
+    // skip other conditions on initial suspended state
+    if (init_dbg_suspend && (wp.condition.type != INIT)) {
+      continue;
+    }
     // skip init condition if not init suspend
-    if ((wp.condition.type == INIT) && !init_dbg_suspend) continue;
+    if ((wp.condition.type == INIT) && !init_dbg_suspend) {
+      continue;
+    }
     // check change conditions only on step end.
-    if (wp.change_condition() && !step_end) continue;
+    if (wp.change_condition() && !step_end) {
+      continue;
+    }
     // if recheck, ignore the cache results and reanalyze everything.
     // if not a recheck, check only unanalyzed tensors
     if (!recheck) {
       wp_lock_.lock();
       bool wp_cache_hit = wp_id_cache_[tensor_name].count(wp.id);
       wp_lock_.unlock();
-      if (wp_cache_hit) continue;
+      if (wp_cache_hit) {
+        continue;
+      }
     }
     std::string found = wp.FindQualifiedTensorName(tensor_name_no_slot);
     if (!found.empty()) {

@@ -258,7 +266,9 @@ void DebugServices::CheckWatchpointsForTensor(
     const auto tensor_name_no_slot = tensor_name.substr(0, tensor_name.find_first_of(':'));
     const auto tensor_slot = std::to_string(tensor->GetSlot());
     // no elements to analyze
-    if (tensor->GetByteSize() == 0) continue;
+    if (tensor->GetByteSize() == 0) {
+      continue;
+    }
     (*chunk_tensor_byte_size)[chunk_id] += tensor->GetByteSize();
     int tensor_dtype = tensor->GetType();
     std::vector<watchpoint_t> watchpoints_to_check;

@@ -269,7 +279,9 @@ void DebugServices::CheckWatchpointsForTensor(
     AddWatchPointsToCheck(init_dbg_suspend, step_end, recheck, tensor_name, tensor_name_no_slot,
                           &previous_iter_tensor_needed, &qualified_tensor_name, &watchpoints_to_check);
     // no wp set on current tensor
-    if (watchpoints_to_check.empty()) continue;
+    if (watchpoints_to_check.empty()) {
+      continue;
+    }
     uint32_t num_elements = tensor->GetNumElements();
 #ifdef OFFLINE_DBG_MODE
     void *previous_tensor_ptr = GetPrevTensor(tensor, previous_iter_tensor_needed);

@@ -1032,6 +1044,15 @@ void DebugServices::ReadNodesTensors(const std::vector<std::string> &name, std::
   }
 }
 
+void DebugServices::SearchNodesTensors(const std::vector<std::string> &name,
+                                       std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list) {
+  if (!result_list) {
+    MS_LOG(DEBUG) << "result_list is nullptr.";
+    return;
+  }
+  tensor_loader_->SearchTensors(name, result_list);
+}
+
 #ifdef ONLINE_DBG_MODE
 bool DebugServices::IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel) const {
   bool ret = false;

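Note: the new SearchNodesTensors is a null-checked wrapper over tensor_loader_->SearchTensors. A minimal caller sketch, assuming a live DebugServices instance (the tensor name below is illustrative; the TensorData accessors are the ones this change set already uses elsewhere):

#include <memory>
#include <string>
#include <tuple>
#include <vector>

void InspectTensors(DebugServices *debug_services) {
  // Illustrative full name in "node_name:slot" form.
  std::vector<std::string> names = {"Default/network/conv1.weight:0"};
  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> results;
  debug_services->SearchNodesTensors(names, &results);
  for (const auto &entry : results) {
    const auto &tensor = std::get<1>(entry);
    if (tensor == nullptr) {
      continue;  // tensor not loaded for this step
    }
    // Accessors seen elsewhere in this diff:
    auto bytes = tensor->GetByteSize();
    auto dtype = tensor->GetType();
    auto shape = tensor->GetShape();
    (void)bytes; (void)dtype; (void)shape;
  }
}
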
debug_services.h

@@ -186,6 +186,15 @@ class DebugServices {
     }
   };
 
+  struct TensorBase {
+    TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
+        : data_size(data_size), dtype(dtype), shape(shape) {}
+    TensorBase() = default;
+    uint64_t data_size = 0;
+    int dtype = 0;
+    std::vector<int64_t> shape;
+  };
+
   struct TensorStat {
     TensorStat(uint64_t data_size, int dtype, const std::vector<int64_t> &shape, bool is_bool, double max_value,
                double min_value, double avg_value, int count, int neg_zero_count, int pos_zero_count, int nan_count,

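The added TensorBase struct is a plain value holder for byte size, dtype id, and shape. A standalone sketch of how it composes (the struct is mirrored from the hunk above so the snippet compiles on its own; the dtype id is an illustrative placeholder, not MindSpore's real type enum):

#include <cstdint>
#include <iostream>
#include <vector>

// Local mirror of DebugServices::TensorBase as defined above.
struct TensorBase {
  TensorBase(uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
      : data_size(data_size), dtype(dtype), shape(shape) {}
  TensorBase() = default;
  uint64_t data_size = 0;
  int dtype = 0;
  std::vector<int64_t> shape;
};

int main() {
  // A 2x3 float32 tensor: 6 elements * 4 bytes = 24 bytes.
  TensorBase base(24, /*dtype=*/11, {2, 3});
  std::cout << "bytes=" << base.data_size << " rank=" << base.shape.size() << std::endl;
  return 0;
}
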
@@ -313,6 +322,9 @@ class DebugServices {
   void ReadNodesTensors(const std::vector<std::string> &name, std::vector<std::string> *ret_name,
                         std::vector<char *> *data_ptr, std::vector<ssize_t> *data_size,
                         std::vector<unsigned int> *dtype, std::vector<std::vector<int64_t>> *const shape);
+
+  void SearchNodesTensors(const std::vector<std::string> &name,
+                          std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> *result_list);
 #ifdef ONLINE_DBG_MODE
   bool IsWatchPoint(const std::string &kernel_name, const CNodePtr &kernel = nullptr) const;
 

debug_grpc.proto

@@ -1,5 +1,5 @@
 /**
- * Copyright 2019 Huawei Technologies Co., Ltd
+ * Copyright 2019-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

@@ -25,6 +25,8 @@ service EventListener {
   rpc SendMetadata (Metadata) returns (EventReply) {};
   rpc SendGraph (stream Chunk) returns (EventReply) {};
   rpc SendTensors (stream TensorProto) returns (EventReply) {};
+  rpc SendTensorBase (TensorBase) returns (EventReply) {};
+  rpc SendTensorStats (TensorSummary) returns (EventReply) {};
   rpc SendWatchpointHits (stream WatchpointHit) returns (EventReply) {};
   rpc SendMultiGraphs (stream Chunk) returns (EventReply) {};
   rpc SendHeartbeat (Heartbeat) returns (EventReply) {};

@@ -87,6 +89,12 @@ message SetCMD {
 
 message ViewCMD {
   repeated TensorProto tensors = 1;
+  enum Level {
+    value = 0;
+    statistics = 1;
+    base = 2;
+  }
+  Level level = 2;
 }
 
 message WatchCondition {

@@ -142,3 +150,28 @@ message Heartbeat {
   string message = 1;
   int32 period = 2;
 }
+
+message TensorSummary{
+  TensorBase tensor_base = 1;
+  Statistics statistics = 2;
+}
+
+message Statistics {
+  bool is_bool = 1;
+  float max_value = 2;
+  float min_value = 3;
+  float avg_value = 4;
+  int32 count = 5;
+  int32 neg_zero_count = 6;
+  int32 pos_zero_count = 7;
+  int32 nan_count = 8;
+  int32 neg_inf_count = 9;
+  int32 pos_inf_count = 10;
+  int32 zero_count = 11;
+}
+
+message TensorBase{
+  int32 data_type = 1;
+  repeated int64 shape = 2;
+  int64 data_size = 3;
+}

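The Statistics fields are simple reductions over the tensor's elements. A standalone sketch of one plausible computation — the authoritative numbers come from DebugServices::GetTensorStatistics, whose body is not part of this diff, and reading neg_zero_count/pos_zero_count as "negative/positive element counts" is an assumption:

#include <algorithm>
#include <cmath>
#include <limits>
#include <vector>

// Plausible reduction filling the Statistics fields from a float buffer.
struct Stats {
  float max_value = std::numeric_limits<float>::lowest();
  float min_value = std::numeric_limits<float>::max();
  float avg_value = 0.0f;
  int count = 0;
  int neg_zero_count = 0;  // assumed: elements < 0
  int pos_zero_count = 0;  // assumed: elements > 0
  int zero_count = 0;
  int nan_count = 0;
  int neg_inf_count = 0;
  int pos_inf_count = 0;
};

Stats ComputeStats(const std::vector<float> &data) {
  Stats s;
  double sum = 0.0;
  for (float v : data) {
    ++s.count;
    if (std::isnan(v)) {
      ++s.nan_count;
      continue;
    }
    if (std::isinf(v)) {
      v < 0 ? ++s.neg_inf_count : ++s.pos_inf_count;
      continue;
    }
    if (v < 0) {
      ++s.neg_zero_count;
    } else if (v > 0) {
      ++s.pos_zero_count;
    } else {
      ++s.zero_count;
    }
    sum += v;
    s.max_value = std::max(s.max_value, v);
    s.min_value = std::min(s.min_value, v);
  }
  if (s.count > 0) {
    s.avg_value = static_cast<float>(sum / s.count);
  }
  return s;
}
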
debugger.cc

@@ -48,6 +48,7 @@ using debugger::Chunk;
 using debugger::EventReply;
 using debugger::GraphProto;
 using debugger::ModelProto;
+using debugger::Statistics;
 using debugger::TensorProto;
 using debugger::WatchCondition;
 using debugger::WatchCondition_Condition_inf;

@@ -839,6 +840,29 @@ void Debugger::ProcessKViewCMD(const EventReply &reply) {
     MS_LOG(INFO) << "tensor iter: " << received_tensor.iter();
     MS_LOG(INFO) << "tensor truncate: " << std::boolalpha << received_tensor.truncate() << std::noboolalpha;
   }
+
+  switch (reply.view_cmd().level()) {
+    case debugger::ViewCMD_Level::ViewCMD_Level_base:
+      MS_LOG(INFO) << "Tensor base request.";
+      ViewBaseLevel(reply);
+      break;
+
+    case debugger::ViewCMD_Level::ViewCMD_Level_statistics:
+      MS_LOG(INFO) << "Tensor statistics request.";
+      ViewStatLevel(reply);
+      break;
+
+    case debugger::ViewCMD_Level::ViewCMD_Level_value:
+      MS_LOG(INFO) << "Tensor value request.";
+      ViewValueLevel(reply);
+      break;
+    default:
+      MS_LOG(DEBUG) << "Debug: Unknown tensor info level";
+      break;
+  }
+}
+
+void Debugger::ViewValueLevel(const EventReply &reply) {
   MS_LOG(INFO) << "Sending tensors";
   std::list<TensorProto> tensors = LoadTensors(GetTensors(reply));
   // print view cmd reply

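The dispatch keys off the new ViewCMD.level field. A hedged fragment showing how a request would select the statistics path, reusing the generated enum constants the switch above already names (assumes the generated protobuf header for debug_grpc.proto is on the include path):

debugger::ViewCMD view_cmd;
view_cmd.set_level(debugger::ViewCMD_Level::ViewCMD_Level_statistics);
// Once wrapped in an EventReply, this reaches Debugger::ProcessKViewCMD and
// the switch above routes it to ViewStatLevel(reply).
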
@@ -860,6 +884,30 @@ void Debugger::ProcessKViewCMD(const EventReply &reply) {
   }
 }
+
+void Debugger::ViewStatLevel(const EventReply &reply) {
+  std::list<TensorSummary> tensor_stat_list = LoadTensorsStat(GetTensors(reply));
+  int index = 0;
+  for (auto tensor_stat : tensor_stat_list) {
+    EventReply send_tensors_stat_reply = grpc_client_->SendTensorStats(tensor_stat);
+    if (send_tensors_stat_reply.status() != debugger::EventReply::OK) {
+      MS_LOG(ERROR) << "Error: SendTensorsStats failed for tensor index " << index << ".";
+    }
+    index++;
+  }
+}
+
+void Debugger::ViewBaseLevel(const EventReply &reply) {
+  std::list<TensorBase> tensors_base_list = LoadTensorsBase(GetTensors(reply));
+  int index = 0;
+  for (auto tensor_base : tensors_base_list) {
+    EventReply send_tensors_base_reply = grpc_client_->SendTensorBase(tensor_base);
+    if (send_tensors_base_reply.status() != debugger::EventReply::OK) {
+      MS_LOG(ERROR) << "Error: SendTensorsBase failed for tensor index " << index << ".";
+    }
+    index++;
+  }
+}
 
 void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
   tensor_item->set_node_name(tensor.node_name());
   tensor_item->set_slot(tensor.slot());

@@ -870,6 +918,35 @@ void AddTensorProtoInfo(TensorProto *tensor_item, const TensorProto &tensor) {
   tensor_item->clear_dims();
 }
 
+void AddTensorStatInfo(const DebugServices::TensorStat &tensor_stat, std::list<TensorSummary> *tensor_summary_list) {
+  if (!tensor_summary_list) {
+    MS_LOG(DEBUG) << "tensor_summary_list is nullptr.";
+    return;
+  }
+  TensorSummary tensor_summary_item;
+  TensorBase *tensor_base = tensor_summary_item.mutable_tensor_base();
+  tensor_base->set_data_type(tensor_stat.dtype);
+  tensor_base->set_data_size(tensor_stat.data_size);
+  for (auto elem : tensor_stat.shape) {
+    tensor_base->add_shape(elem);
+  }
+
+  Statistics *tensor_statistics = tensor_summary_item.mutable_statistics();
+  tensor_statistics->set_is_bool(tensor_stat.is_bool);
+  tensor_statistics->set_max_value(tensor_stat.max_value);
+  tensor_statistics->set_min_value(tensor_stat.min_value);
+  tensor_statistics->set_avg_value(tensor_stat.avg_value);
+  tensor_statistics->set_count(tensor_stat.count);
+  tensor_statistics->set_neg_zero_count(tensor_stat.neg_zero_count);
+  tensor_statistics->set_pos_zero_count(tensor_stat.pos_zero_count);
+  tensor_statistics->set_nan_count(tensor_stat.nan_count);
+  tensor_statistics->set_neg_inf_count(tensor_stat.neg_inf_count);
+  tensor_statistics->set_pos_inf_count(tensor_stat.pos_inf_count);
+  tensor_statistics->set_zero_count(tensor_stat.zero_count);
+
+  tensor_summary_list->push_back(tensor_summary_item);
+}
+
 void Debugger::SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                              const ProtoVector<WatchCondition_Parameter> &parameters) {
   std::vector<std::tuple<std::string, bool>> check_node_list;

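AddTensorStatInfo is the producer side of TensorSummary; a consumer reads it back with the matching generated getters. A hedged fragment using standard protobuf C++ accessors for the messages defined in debug_grpc.proto (the include name is hypothetical):

#include <iostream>
// #include "debug_grpc.pb.h"  // hypothetical generated header name

void PrintSummary(const debugger::TensorSummary &summary) {
  const debugger::TensorBase &base = summary.tensor_base();
  std::cout << "dtype=" << base.data_type() << " bytes=" << base.data_size() << "\n";
  const debugger::Statistics &stats = summary.statistics();
  std::cout << "min=" << stats.min_value() << " max=" << stats.max_value()
            << " nan=" << stats.nan_count() << "\n";
}
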
@@ -945,6 +1022,56 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
   return tensor_list;
 }
 
+std::list<TensorBase> Debugger::LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const {
+  std::list<TensorBase> tensor_base_list;
+  std::vector<std::string> name;
+  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
+  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
+  debug_services_->SearchNodesTensors(name, &result_list);
+  for (auto result : result_list) {
+    auto tensor = std::get<1>(result);
+    if (!tensor) {
+      // tensor was not found, creating empty tensor base.
+      TensorBase tensor_base_item;
+      tensor_base_item.set_data_size(0);
+      tensor_base_item.set_data_type(0);
+      tensor_base_item.add_shape(0);
+      tensor_base_list.push_back(tensor_base_item);
+      continue;
+    }
+    // tensor was found creating tensor base object.
+    TensorBase tensor_base_item;
+    tensor_base_item.set_data_size(tensor->GetByteSize());
+    tensor_base_item.set_data_type(tensor->GetType());
+    for (auto elem : tensor->GetShape()) {
+      tensor_base_item.add_shape(elem);
+    }
+    tensor_base_list.push_back(tensor_base_item);
+  }
+  return tensor_base_list;
+}
+
+std::list<TensorSummary> Debugger::LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const {
+  std::list<TensorSummary> tensor_summary_list;
+  std::vector<std::string> name;
+  std::transform(tensors.begin(), tensors.end(), std::back_inserter(name), GetTensorFullName);
+  std::vector<std::tuple<std::string, std::shared_ptr<TensorData>>> result_list;
+  debug_services_->SearchNodesTensors(name, &result_list);
+  for (auto result : result_list) {
+    auto tensor = std::get<1>(result);
+    if (!tensor) {
+      // tensor was not found, creating empty tensor summary.
+      DebugServices::TensorStat tensor_stat;
+      AddTensorStatInfo(tensor_stat, &tensor_summary_list);
+      continue;
+    }
+    // tensor was found creating tensor summary object.
+    DebugServices::TensorStat tensor_stat = debug_services_->GetTensorStatistics(tensor);
+    AddTensorStatInfo(tensor_stat, &tensor_summary_list);
+  }
+  return tensor_summary_list;
+}
+
 void Debugger::Exit() {
   // clear resource before exit
   // debugger will notify main thread to exit because main thread can only exit at step boundary.

debugger.h

@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

@@ -32,6 +32,7 @@ using debugger::DataType;
 using debugger::EventReply;
 using debugger::GraphProto;
 using debugger::ModelProto;
+using debugger::Statistics;
 using debugger::TensorProto;
 using debugger::WatchCondition;
 using debugger::WatchCondition_Parameter;

@@ -216,6 +217,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   void ProcessKSetCMD(const EventReply &reply);
   // Process the KViewCMD
   void ProcessKViewCMD(const EventReply &reply);
+  // ViewCMD base level
+  void ViewBaseLevel(const EventReply &reply);
+  // ViewCMD statistics level
+  void ViewStatLevel(const EventReply &reply);
+  // ViewCMD value level
+  void ViewValueLevel(const EventReply &reply);
   // set what nodes and conditions to watch
   void SetWatchpoint(const ProtoVector<WatchNode> &nodes, const WatchCondition &condition, const int32_t id,
                      const ProtoVector<WatchCondition_Parameter> &parameters);

@@ -226,6 +233,12 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
   // load tensor for view command
   std::list<TensorProto> LoadTensors(const ProtoVector<TensorProto> &tensors) const;
 
+  // load tensor base for view command
+  std::list<TensorBase> LoadTensorsBase(const ProtoVector<TensorProto> &tensors) const;
+
+  // load tensor statistics for view command
+  std::list<TensorSummary> LoadTensorsStat(const ProtoVector<TensorProto> &tensors) const;
+
   // terminate training process
   void Exit();
 

grpc_client.cc

@@ -26,7 +26,9 @@ using debugger::EventReply_Status_FAILED;
 using debugger::GraphProto;
 using debugger::Heartbeat;
 using debugger::Metadata;
+using debugger::TensorBase;
 using debugger::TensorProto;
+using debugger::TensorSummary;
 using debugger::WatchpointHit;
 
 namespace mindspore {

@@ -200,4 +202,32 @@ EventReply GrpcClient::SendHeartbeat(const Heartbeat &heartbeat) {
   }
   return reply;
 }
+
+EventReply GrpcClient::SendTensorBase(const TensorBase &tensor_base) {
+  EventReply reply;
+  grpc::ClientContext context;
+
+  grpc::Status status = stub_->SendTensorBase(&context, tensor_base, &reply);
+
+  if (!status.ok()) {
+    MS_LOG(ERROR) << "RPC failed: SendTensorBase";
+    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
+    reply.set_status(EventReply_Status_FAILED);
+  }
+  return reply;
+}
+
+EventReply GrpcClient::SendTensorStats(const TensorSummary &tensor_summary) {
+  EventReply reply;
+  grpc::ClientContext context;
+
+  grpc::Status status = stub_->SendTensorStats(&context, tensor_summary, &reply);
+
+  if (!status.ok()) {
+    MS_LOG(ERROR) << "RPC failed: SendTensorStats";
+    MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
+    reply.set_status(EventReply_Status_FAILED);
+  }
+  return reply;
+}
 }  // namespace mindspore

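Both new RPCs are unary, unlike the streaming SendTensors/SendGraph defined in the proto above: one TensorBase or TensorSummary message per reply. A hedged caller fragment, assuming a connected GrpcClient named client and reusing the setters AddTensorStatInfo uses (the dtype id and sizes are illustrative):

debugger::TensorBase tensor_base;
tensor_base.set_data_type(11);   // illustrative dtype id
tensor_base.set_data_size(24);   // e.g. 2x3 float32 = 24 bytes
tensor_base.add_shape(2);
tensor_base.add_shape(3);
EventReply reply = client.SendTensorBase(tensor_base);
if (reply.status() != debugger::EventReply::OK) {
  // failure already logged by GrpcClient; decide whether to retry
}
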
grpc_client.h

@@ -1,5 +1,5 @@
 /**
- * Copyright 2020 Huawei Technologies Co., Ltd
+ * Copyright 2020-2021 Huawei Technologies Co., Ltd
  *
  * Licensed under the Apache License, Version 2.0 (the "License");
  * you may not use this file except in compliance with the License.

@@ -29,7 +29,9 @@ using debugger::EventReply;
 using debugger::GraphProto;
 using debugger::Heartbeat;
 using debugger::Metadata;
+using debugger::TensorBase;
 using debugger::TensorProto;
+using debugger::TensorSummary;
 using debugger::WatchpointHit;
 
 namespace mindspore {

@@ -55,6 +57,10 @@ class GrpcClient {
 
   EventReply SendTensors(const std::list<TensorProto> &tensors);
 
+  EventReply SendTensorBase(const TensorBase &tensor_base);
+
+  EventReply SendTensorStats(const TensorSummary &tensor_summary);
+
   EventReply SendMultiGraphs(const std::list<Chunk> &chunks);
 
   EventReply SendWatchpointHits(const std::list<WatchpointHit> &watchpoints);
