forked from mindspore-Ecosystem/mindspore
sync code self check from dev 1.1
This commit is contained in:
parent
180fd0d9f3
commit
f84b27b444
@@ -294,6 +294,7 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
+    DumpSetup(graph);
   }
 }
 
 void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
   MS_EXCEPTION_IF_NULL(graph_ptr);
   // access lock for public method
@@ -313,23 +314,7 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
   if (graph_proto_list_.size() > 1) {
     // there are more than one graphs are not dataset_graph
     if (not_dataset_graph_sum_ > 0) {
-      // only try to enable debugger if they are not all dataset graphs
-      if (!debugger_enabled_) {
-        EnableDebugger();
-      }
-      if (debugger_enabled_) {
-        // only send compiled graphs once at the initial step.
-        auto dbg_graph_ptr = graph_ptr_;
-        // use current graph ptr to load parameters
-        graph_ptr_ = graph_ptr;
-        LoadParametersAndConst();
-        // revert graph ptr to original value
-        graph_ptr_ = dbg_graph_ptr;
-
-        SendMultiGraphsAndSuspend(graph_proto_list_);
-
-        graph_proto_list_.clear();
-      }
+      SendMultiGraphsAndClear(graph_ptr);
     }
   } else if (graph_proto_list_.size() == 1) {
     // single graph, and not the initial step
@@ -359,6 +344,27 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
   // resets for the new graph
   suspended_at_last_kernel_ = false;
 }
 
+void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
+  // only try to enable debugger if they are not all dataset graphs
+  if (!debugger_enabled_) {
+    EnableDebugger();
+  }
+  if (debugger_enabled_) {
+    // only send compiled graphs once at the initial step.
+    auto dbg_graph_ptr = graph_ptr_;
+    // use current graph ptr to load parameters
+    graph_ptr_ = graph_ptr;
+    LoadParametersAndConst();
+    // revert graph ptr to original value
+    graph_ptr_ = dbg_graph_ptr;
+
+    SendMultiGraphsAndSuspend(graph_proto_list_);
+
+    graph_proto_list_.clear();
+  }
+}
+
 bool Debugger::DumpDataEnabledIteration() const {
   auto &dump_json_parser = DumpJsonParser::GetInstance();
   if (!dump_json_parser.e2e_dump_enabled()) {
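Note: the extracted SendMultiGraphsAndClear helper relies on a save/restore idiom around graph_ptr_: the member is pointed at the caller's graph for the duration of LoadParametersAndConst and then reverted. A minimal, self-contained sketch of that idiom; Graph and DebuggerLike are hypothetical stand-ins for the MindSpore types:

    #include <memory>

    struct Graph {};

    class DebuggerLike {
     public:
      // Temporarily point graph_ptr_ at the caller's graph, do the work
      // that reads graph_ptr_, then restore the original value.
      void SendGraphs(const std::shared_ptr<Graph> &current) {
        auto saved = graph_ptr_;
        graph_ptr_ = current;
        LoadParametersAndConst();
        graph_ptr_ = saved;
      }

     private:
      void LoadParametersAndConst() { /* reads graph_ptr_ internally */ }
      std::shared_ptr<Graph> graph_ptr_;
    };

An RAII guard would also restore the pointer on an early return or exception; the diff keeps the explicit swap, which is fine as long as nothing in between can throw.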
@@ -382,6 +388,7 @@ uint32_t Debugger::GetRankID() {
   uint32_t rank_id = device_context->GetRankID();
   return rank_id;
 }
 
 void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
   uint32_t rank_id = GetRankID();
   if (debugger_ && debugger_->DebuggerBackendEnabled()) {
@@ -406,6 +413,7 @@ void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const {
   E2eDump::DumpSetup(kernel_graph.get(), rank_id);
   MS_LOG(INFO) << "Finish!";
 }
 
 void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
   // This function will be called for new GPU runtime using MindRTBackend
   auto &json_parser = DumpJsonParser::GetInstance();
@@ -491,6 +499,7 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
   }
   return false;
 }
 
 void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
   // access lock for public method
   std::lock_guard<std::mutex> a_lock(access_lock_);
@@ -1020,7 +1029,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
   // add tensor to result list and increment result_index to check next item in ret_name
   tensor_list.push_back(tensor_item);
   if (size_iter > INT_MAX - g_chunk_size) {
     MS_EXCEPTION(ValueError) << size_iter << " + " << g_chunk_size << " would lead to integer overflow!";
   }
   size_iter += g_chunk_size;
 }
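The guard above rejects size_iter + g_chunk_size before the addition can overflow, since signed integer overflow is undefined behavior in C++. A stripped-down sketch of the same check, with std::overflow_error standing in for MS_EXCEPTION and assuming a non-negative chunk size:

    #include <climits>
    #include <stdexcept>
    #include <string>

    // Check the headroom first: offset + chunk overflows int exactly when
    // offset > INT_MAX - chunk (for chunk >= 0), so test that instead.
    void Advance(int *offset, int chunk) {
      if (*offset > INT_MAX - chunk) {
        throw std::overflow_error(std::to_string(*offset) + " + " + std::to_string(chunk) +
                                  " would lead to integer overflow!");
      }
      *offset += chunk;
    }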
@@ -1434,6 +1443,7 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
     ++num_step_;
   }
 }
 
 void Debugger::UpdateStepNumGPU() {
   // UpdateStepNum with DebugActor::DebugOnStepEnd
   if (device_target_ == kGPUDevice && (debugger_enabled_ || DumpDataEnabledIteration())) {
@@ -1452,6 +1462,7 @@ void Debugger::ClearCurrentData() {
     }
   }
 }
 
 bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
   return debug_services_->TensorExistsInCurrent(tensor_name);
 }
@@ -189,6 +189,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
 
   void SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list);
 
+  // send multi_graphs and clear the graph_proto_list_
+  void SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr);
+
   // wait for command and process command
   // send command request and process reply in a loop
   // break if RunCMD
@@ -35,9 +35,9 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
 namespace mindspore {
 static const size_t PARAMETER_OUTPUT_INDEX = 0;
 
-std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
+std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
   // define a vector containing real output number
-  std::vector<int> real_outputs;
+  std::vector<size_t> real_outputs;
   // P.BatchNorm is used for training and inference
   // can add the filter list for more operators here....
   if (node_name == "BatchNorm") {
@@ -46,8 +46,7 @@ std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &out
   } else {
     // by default, TensorLoader will load all outputs
     for (size_t j = 0; j < output_size; ++j) {
-      size_t index = j;
-      real_outputs.push_back(index);
+      real_outputs.push_back(j);
     }
   }
   return real_outputs;
@@ -86,11 +85,11 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, ui
   auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
   auto node_name = AnfAlgo::GetCNodeName(cnode);
   std::string kernel_name = GetKernelNodeName(cnode);
-  std::vector<int> real_outputs = CheckRealOutput(node_name, output_size);
+  std::vector<size_t> real_outputs = CheckRealOutput(node_name, output_size);
 
-  for (int j : real_outputs) {
+  for (size_t j : real_outputs) {
     auto addr = kernel_outputs[j];
-    auto type = AnfAlgo::GetOutputInferDataType(cnode, (size_t)j);
+    auto type = AnfAlgo::GetOutputInferDataType(cnode, j);
     // For example, this happens with the Depend op
     if (type == kMetaTypeNone) {
       continue;
@@ -24,7 +24,7 @@ using mindspore::kernel::KernelLaunchInfo;
 
 namespace mindspore {
 
-std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size);
+std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size);
 
 void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_);
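Switching CheckRealOutput and its callers from int to size_t keeps the index type consistent end to end: output_size is already a size_t and the indices subscript containers, so the change removes casts such as (size_t)j at every use site and avoids signed/unsigned comparison warnings. A small sketch of the default branch under that convention; AllIndices is an illustrative name, not a function from the codebase:

    #include <cstddef>
    #include <vector>

    // Build the index list the way the default branch does, keeping
    // everything size_t so no conversion is needed at the subscript sites.
    std::vector<size_t> AllIndices(size_t output_size) {
      std::vector<size_t> indices;
      indices.reserve(output_size);
      for (size_t j = 0; j < output_size; ++j) {
        indices.push_back(j);
      }
      return indices;
    }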
@@ -48,7 +48,6 @@ EventReply GrpcClient::WaitForCommand(const Metadata &metadata) {
   EventReply reply;
   grpc::ClientContext context;
   grpc::Status status = stub_->WaitCMD(&context, metadata, &reply);
 
   if (!status.ok()) {
     MS_LOG(ERROR) << "RPC failed: WaitForCommand";
     MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -61,7 +60,6 @@ EventReply GrpcClient::SendMetadata(const Metadata &metadata) {
   EventReply reply;
   grpc::ClientContext context;
   grpc::Status status = stub_->SendMetadata(&context, metadata, &reply);
 
   if (!status.ok()) {
     MS_LOG(ERROR) << "RPC failed: SendMetadata";
     MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -114,7 +112,6 @@ EventReply GrpcClient::SendGraph(const GraphProto &graph) {
   }
   writer->WritesDone();
   grpc::Status status = writer->Finish();
 
   if (!status.ok()) {
     MS_LOG(ERROR) << "RPC failed: SendGraph";
     MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -136,7 +133,6 @@ EventReply GrpcClient::SendMultiGraphs(const std::list<Chunk> &chunks) {
   }
   writer->WritesDone();
   grpc::Status status = writer->Finish();
 
   if (!status.ok()) {
     MS_LOG(ERROR) << "RPC failed: SendMultigraphs";
     MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -158,7 +154,6 @@ EventReply GrpcClient::SendTensors(const std::list<TensorProto> &tensors) {
   }
   writer->WritesDone();
   grpc::Status status = writer->Finish();
 
   if (!status.ok()) {
     MS_LOG(ERROR) << "RPC failed: SendTensors";
     MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -180,7 +175,6 @@ EventReply GrpcClient::SendWatchpointHits(const std::list<WatchpointHit> &watchp
   }
   writer->WritesDone();
   grpc::Status status = writer->Finish();
 
   if (!status.ok()) {
     MS_LOG(ERROR) << "RPC failed: SendWatchpointHits";
     MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
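The five GrpcClient hunks above share one client-streaming shape: write every item, half-close with WritesDone, then Finish to collect the server's status. A generic sketch of that shape; Writer stands for any type with the grpc::ClientWriter-style interface, and the type names are placeholders rather than the real proto types:

    #include <list>

    // Stream all items, half-close, and return the final status. Write()
    // returning false means the stream was closed early by the server.
    template <typename Status, typename Writer, typename Msg>
    Status StreamAll(Writer *writer, const std::list<Msg> &items) {
      for (const auto &item : items) {
        if (!writer->Write(item)) {
          break;
        }
      }
      writer->WritesDone();
      return writer->Finish();
    }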
@@ -18,25 +18,24 @@
 #include <algorithm>
 #include <chrono>
 
-DbgServices::DbgServices(bool verbose) { debug_services_ = new DebugServices(); }
+DbgServices::DbgServices(bool verbose) { debug_services_ = std::make_shared<DebugServices>(); }
 
 DbgServices::DbgServices(const DbgServices &other) {
   MS_LOG(INFO) << "cpp DbgServices object is created via copy";
-  debug_services_ = new DebugServices(*other.debug_services_);
+  debug_services_ = other.debug_services_;
 }
 
 DbgServices &DbgServices::operator=(const DbgServices &other) {
   MS_LOG(INFO) << "cpp DbgServices object is being assigned a different state";
   if (this != &other) {
-    delete debug_services_;
-    debug_services_ = new DebugServices(*other.debug_services_);
+    debug_services_ = other.debug_services_;
   }
   return *this;
 }
 
 DbgServices::~DbgServices() noexcept {
   MS_LOG(INFO) << "cpp DbgServices object is deleted";
-  delete debug_services_;
   debug_services_ = nullptr;
 }
 
 std::string DbgServices::GetVersion() const {
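These hunks replace a raw DebugServices* with std::shared_ptr<DebugServices>: copy construction and assignment now alias the same state instead of deep-copying, and destruction needs no manual delete. A minimal sketch of the resulting ownership, with State as a hypothetical stand-in for DebugServices:

    #include <memory>

    struct State {};

    // With shared_ptr, the compiler-generated copy operations share
    // ownership of one State and the destructor releases it automatically;
    // the delete/new pairs from the raw-pointer version disappear.
    class Services {
     public:
      Services() : state_(std::make_shared<State>()) {}
      Services(const Services &other) = default;             // copies alias state_
      Services &operator=(const Services &other) = default;  // no manual delete

     private:
      std::shared_ptr<State> state_;
    };

Note the new copy semantics are shallow: after a copy, both objects observe mutations made through either one, which matches debug_services_ = other.debug_services_; in the diff.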
@@ -70,25 +69,26 @@ int32_t DbgServices::AddWatchpoint(
   unsigned int id, unsigned int watch_condition,
   std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
   std::vector<parameter_t> parameter_list) {
-  MS_LOG(INFO) << "cpp start";
+  MS_EXCEPTION_IF_NULL(debug_services_);
+  MS_LOG(INFO) << "cpp DbgServices start AddWatchpoint";
 
   MS_LOG(INFO) << "cpp DbgServices AddWatchpoint id " << id;
   MS_LOG(INFO) << "cpp DbgServices AddWatchpoint watch_condition " << watch_condition;
   for (auto const &node : check_nodes) {
-    MS_LOG(INFO) << "cpp DbgServices AddWatchpoint name " << node.first;
+    MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint name " << node.first;
     auto attr_map = node.second;
 
     bool is_output = std::get<bool>(attr_map["is_output"]);
-    MS_LOG(INFO) << "cpp DbgServices AddWatchpoint is_output " << is_output;
+    MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint is_output " << is_output;
 
     std::vector<std::string> rank_id_str = std::get<std::vector<std::string>>(attr_map["rank_id"]);
     std::vector<std::uint32_t> rank_id;
     (void)std::transform(
       rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
       [](std::string &id_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(id_str)); });
-    MS_LOG(INFO) << "cpp DbgServices AddWatchpoint rank_id ";
+    MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint rank_id: ";
     for (auto const &i : rank_id) {
-      MS_LOG(INFO) << i << " ";
+      MS_LOG(DEBUG) << i << " ";
     }
 
     // std::vector<uint32_t> root_graph_id = std::get<std::vector<uint32_t>>(attr_map["root_graph_id"]);
@@ -97,9 +97,9 @@ int32_t DbgServices::AddWatchpoint(
     (void)std::transform(
       root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
       [](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
-    MS_LOG(INFO) << "cpp DbgServices AddWatchpoint root_graph_id";
+    MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint root_graph_id: ";
     for (auto const &j : root_graph_id) {
-      MS_LOG(INFO) << j << " ";
+      MS_LOG(DEBUG) << j << " ";
     }
   }
 
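Both rank_id and root_graph_id above are converted from strings with the same std::transform / std::back_inserter / std::stoul recipe. A self-contained version of that recipe; ToIds is an illustrative name:

    #include <algorithm>
    #include <cstdint>
    #include <iterator>
    #include <string>
    #include <vector>

    // Append one converted element per input string: std::stoul parses the
    // text and the cast narrows to the uint32_t the backend expects.
    std::vector<std::uint32_t> ToIds(const std::vector<std::string> &strs) {
      std::vector<std::uint32_t> ids;
      (void)std::transform(strs.begin(), strs.end(), std::back_inserter(ids),
                           [](const std::string &s) { return static_cast<std::uint32_t>(std::stoul(s)); });
      return ids;
    }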
@@ -154,17 +154,19 @@ int32_t DbgServices::AddWatchpoint(
 
   debug_services_->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend,
                                  &check_node_device_list, &check_node_graph_list);
-  MS_LOG(INFO) << "cpp end";
+  MS_LOG(INFO) << "cpp DbgServices end AddWatchpoint";
   return 0;
 }
 
 int32_t DbgServices::RemoveWatchpoint(unsigned int id) {
+  MS_EXCEPTION_IF_NULL(debug_services_);
   MS_LOG(INFO) << "cpp DbgServices RemoveWatchpoint id " << id;
   debug_services_->RemoveWatchpoint(id);
   return 0;
 }
 
 std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iteration) {
+  MS_EXCEPTION_IF_NULL(debug_services_);
   MS_LOG(INFO) << "cpp DbgServices CheckWatchpoint iteration " << iteration;
 
   std::vector<std::string> name;
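The added MS_EXCEPTION_IF_NULL calls validate debug_services_ at each public entry point, so a call on a wrapper whose backend is missing fails loudly instead of dereferencing null. A stripped-down sketch of that guard, with a plain exception and hypothetical names in place of the MindSpore macro and types:

    #include <memory>
    #include <stdexcept>

    struct DebugServicesLike {
      void RemoveWatchpoint(unsigned int /*id*/) {}
    };

    class Wrapper {
     public:
      int RemoveWatchpoint(unsigned int id) {
        // Validate the shared member before every use, as the diff does.
        if (backend_ == nullptr) {
          throw std::runtime_error("backend_ must not be null");
        }
        backend_->RemoveWatchpoint(id);
        return 0;
      }

     private:
      std::shared_ptr<DebugServicesLike> backend_;
    };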
@@ -197,19 +199,19 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
     watchpoint_hit_t hit(name[i], std::stoi(slot[i]), condition[i], watchpoint_id[i], api_parameter_vector,
                          error_codes[i], rank_id[i], root_graph_id[i]);
 
-    MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
-    MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
-    MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id;
-    MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code;
-    MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t rank_id " << hit.rank_id;
-    MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id;
+    MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
+    MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
+    MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id;
+    MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code;
+    MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t rank_id " << hit.rank_id;
+    MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id;
 
     for (auto const &parameter_i : api_parameter_vector) {
-      MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name;
-      MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled;
-      MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value;
-      MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit;
-      MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value;
+      MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name;
+      MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled;
+      MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value;
+      MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit;
+      MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value;
     }
 
     hits.push_back(hit);
@@ -230,6 +232,7 @@ unsigned int GetTensorSlot(const tensor_info_t info) { return info.slot; }
 bool GetTensorIsOutput(const tensor_info_t info) { return info.is_output; }
 
 std::vector<std::shared_ptr<TensorData>> DbgServices::ReadTensorsUtil(std::vector<tensor_info_t> info) {
+  MS_EXCEPTION_IF_NULL(debug_services_);
   for (auto i : info) {
     MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration "
                  << i.iteration << ", rank_id " << i.rank_id << ", root_graph_id " << i.root_graph_id << ", is_output "
@@ -284,6 +287,7 @@ std::vector<tensor_data_t> DbgServices::ReadTensors(const std::vector<tensor_inf
   std::vector<std::shared_ptr<TensorData>> result_list;
   result_list = ReadTensorsUtil(info);
   for (auto result : result_list) {
+    MS_EXCEPTION_IF_NULL(result);
     tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape());
     tensors_read.push_back(tensor_data_item);
   }
@@ -103,7 +103,7 @@ struct tensor_info_t {
 struct tensor_data_t {
   tensor_data_t(char *data_ptr, uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
       : data_size(data_size), dtype(dtype), shape(shape) {
-    if (data_ptr != NULL) {
+    if (data_ptr != nullptr) {
       this->data_ptr = py::bytes(data_ptr, data_size);
     } else {
       this->data_ptr = py::bytes();
@@ -182,9 +182,6 @@ struct TensorStatData {
 };
 
 class DbgServices {
- private:
-  DebugServices *debug_services_;
-
  public:
   explicit DbgServices(bool verbose = false);
 
@@ -215,6 +212,9 @@ class DbgServices {
   std::vector<TensorStatData> ReadTensorsStat(const std::vector<tensor_info_t> info);
 
   std::string GetVersion() const;
 
+ private:
+  std::shared_ptr<DebugServices> debug_services_ = nullptr;
+
 };
 
 #endif  // DEBUG_DBG_SERVICES_H_
@@ -328,10 +328,10 @@ void TensorSummary<T>::InitCalculators(const std::vector<DebugServices::watchpoi
         range_counts_[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
       }
     } else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr_) {
-      (void)means_.insert({"curr_prev_diff_mean", std::make_unique<MeanCalculator>()});
-      (void)means_.insert({"abs_prev_mean", std::make_unique<MeanCalculator>()});
+      (void)means_.emplace("curr_prev_diff_mean", std::make_unique<MeanCalculator>());
+      (void)means_.emplace("abs_prev_mean", std::make_unique<MeanCalculator>());
     } else if (wp.abs_mean_enabled()) {
-      (void)means_.insert({"abs_current_mean", std::make_unique<MeanCalculator>()});
+      (void)means_.emplace("abs_current_mean", std::make_unique<MeanCalculator>());
     }
   }
 }
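insert({key, value}) first materializes a std::pair and then moves it into the map, while emplace forwards the arguments and constructs the entry in place, which reads more naturally with move-only mapped types such as std::unique_ptr. Both forms leave the map untouched when the key already exists. A compilable sketch with a hypothetical calculator type:

    #include <map>
    #include <memory>
    #include <string>

    struct MeanCalculatorLike {};

    int main() {
      std::map<std::string, std::unique_ptr<MeanCalculatorLike>> means;
      // Same effect as insert({...}), without the intermediate pair.
      (void)means.emplace("abs_current_mean", std::make_unique<MeanCalculatorLike>());
      return 0;
    }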
@@ -150,6 +150,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
     return ret;
   }
   auto tensor_data = std::make_shared<mindspore::TensorData>();
+  MS_EXCEPTION_IF_NULL(tensor_data);
   tensor_data->SetName(tensor_name);
   tensor_data->SetExecutionOrder(execution_order);
   tensor_data->SetSlot(slot);
@@ -57,7 +57,7 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
   } else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
 #ifdef ENABLE_DEBUGGER
     auto debugger = Debugger::GetInstance();
-    if (debugger) {
+    if (debugger != nullptr) {
       std::string kernel_name = cnode->fullname_with_scope();
       debugger->SetCurNode(kernel_name);
       bool read_data = CheckReadData(cnode);
@@ -111,7 +111,7 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
 
 #ifdef ENABLE_DEBUGGER
   auto debugger = Debugger::GetInstance();
-  if (debugger) {
+  if (debugger != nullptr) {
     debugger->Debugger::UpdateStepNumGPU();
     // Reset exec_order for the next step
     exec_order_ = 0;
@@ -28,7 +28,9 @@ import numpy as np
 
 
 class ConvertToolLoader:
-    """Module to load CANN conversion tool."""
+    """
+    Module to load CANN conversion tool.
+    """
 
     def __init__(self):
        self.utils = None
@@ -44,7 +46,9 @@ class ConvertToolLoader:
 
     @staticmethod
     def find_toolkit_path():
-        """Find the path to Ascend toolkit."""
+        """
+        Find the path to Ascend toolkit.
+        """
         ascend_toolkit_path = os.getenv("ASCEND_TOOLKIT_PATH")
         if not ascend_toolkit_path:
             ascend_toolkit_path = "/usr/local/Ascend"
@@ -63,7 +67,9 @@ class ConvertToolLoader:
         return msaccucmp_file_list[0].parent
 
     def load_convert_tool(self):
-        """load CANN conversion tool from the toolkit path."""
+        """
+        Load CANN conversion tool from the toolkit path.
+        """
         # add toolkit path to system searching module path
         if str(self.toolkit_path) not in sys.path:
             sys.path.insert(0, str(self.toolkit_path))
@@ -99,13 +105,17 @@ class ConvertToolLoader:
         self.compare_exception = self.utils.CompareError
 
     def reset_system_path(self):
-        # restore system searching module path
+        """
+        Restore system searching module path
+        """
         if str(self.toolkit_path) in sys.path:
             sys.path.remove(str(self.toolkit_path))
 
 
 def parse_args(file_list, output_path):
-    """Helper function to parse the input argument for the conversion configuration."""
+    """
+    Helper function to parse the input argument for the conversion configuration.
+    """
     args_dict = dict()
     args_dict['dump_version'] = '2.0'
     args_dict['format'] = 'NCHW'
@@ -122,7 +132,9 @@ def parse_args(file_list, output_path):
 
 
 class AsyncDumpConverter:
-    """Convert the target async dump data into npy files."""
+    """
+    Convert the target async dump data into npy files.
+    """
 
     def __init__(self, file_list, output_path):
         # check input path
@@ -138,12 +150,16 @@ class AsyncDumpConverter:
         self.clear_failed_list_file()
 
     def clear_failed_list_file(self):
-        """Remove existing failed txt file."""
+        """
+        Remove existing failed txt file.
+        """
         if self.failed_file_path and os.path.exists(self.failed_file_path):
             os.remove(self.failed_file_path)
 
     def convert_files(self):
-        """Main entry of the converter to convert async dump files into npy format."""
+        """
+        Main entry of the converter to convert async dump files into npy format.
+        """
         self.convert_tool.log.print_info_log('Start to convert async dump files.')
         try:
             if self.args.format is not None:
@@ -164,7 +180,9 @@
         self.convert_tool.log.print_info_log('Finish to convert async dump files.')
 
     def convert_failed_tensors(self):
-        """Convert the failed tensor recorded in the failed txt file."""
+        """
+        Convert the failed tensor recorded in the failed txt file.
+        """
         self.convert_tool.log.print_info_log(
             'Start to convert failed tensors recorded in ' + self.failed_file_path + '.')
         with open(self.failed_file_path) as failed_lines:
@@ -177,7 +195,9 @@
                 'Failed to convert ' + failed_line + ' to Host format: ' + str(err))
 
     def convert_one_failed_tensor(self, failed_tensor):
-        """Convert failed operator one by one."""
+        """
+        Convert failed operator one by one.
+        """
         if len(failed_tensor) <= 1:
             raise ValueError(
                 "Invalid tensor info in convert_failed_file_list.txt")
@@ -191,11 +211,13 @@
         tensor = getattr(op_data, tensor_type)[index]
         dump_data_array = self.convert_tool.utils.deserialize_dump_data_to_array(tensor)
         array = dump_data_array.reshape(tensor.shape.dim)
-        self._save_tensor_to_npy_file(
-            file_path, tensor_type, index, tensor.format, array)
+        out_path = self._generate_path(file_path, tensor_type, index, tensor.format)
+        self._save_tensor_to_npy_file(out_path, array)
 
     def handle_multi_process(self, convert_obj, files):
-        """Convert async format files to npy in a multithreaded manner."""
+        """
+        Convert async format files to npy in a multithreaded manner.
+        """
         return_code = self.convert_tool.compare_none_error
         # try looking for function in compatibility with the toolkit package version.
         progress = self.convert_tool.progress(len(files))
@@ -223,7 +245,9 @@
         return return_code
 
     def _get_file_list(self, files, convert_obj):
-        """Process to get file lists in multi_process."""
+        """
+        Process to get file lists in multi_process.
+        """
         multi_process_file_list = []
         big_file_list = []
         max_file_size = 0
@@ -241,7 +265,9 @@
         return multi_process_file_list, big_file_list
 
     def _process_big_file(self, big_file_list, convert_obj):
-        """Process big file in multi_process."""
+        """
+        Process big file in multi_process.
+        """
         return_code = self.convert_tool.compare_none_error
         for big_file in big_file_list:
             if hasattr(convert_obj, '_convert_format_for_one_file'):
@@ -256,8 +282,18 @@
                 return_code = ret_bf
         return return_code
 
-    def _save_tensor_to_npy_file(self, file_path, tensor_type, idx, tensor_format, dump_data_array):
-        """Save tensor file into npy format."""
+    @staticmethod
+    def _save_tensor_to_npy_file(out_path, dump_data_array):
+        """
+        Save tensor file into npy format.
+        """
+        np.save(out_path, dump_data_array)
+        os.chmod(out_path, stat.S_IRUSR)
+
+    def _generate_path(self, file_path, tensor_type, idx, tensor_format):
+        """
+        Generate path and filename to the target npy files
+        """
         file_name = os.path.basename(file_path)
         name_splits = file_name.split('.')
         name_splits[1] = name_splits[1].split('_')[-1]
@@ -268,12 +304,12 @@
             idx,
             self.convert_tool.common.get_format_string(tensor_format)
         )
-        out_path = os.path.join(self.output_path, out_file_name)
-        np.save(out_path, dump_data_array)
-        os.chmod(out_path, stat.S_IRUSR)
+        return os.path.join(self.output_path, out_file_name)
 
     def _rename_generated_npy_files(self):
-        """In order to follow dump naming convention, rename npy files generated by CANN conversion tool."""
+        """
+        In order to follow dump naming convention, rename npy files generated by CANN conversion tool.
+        """
         target_file_list = []
         for in_file in self.files_to_convert:
             target_file_list.extend(glob.glob(in_file + "*.npy"))
@@ -31,7 +31,7 @@ def get_version():
     Function to return offline Debug Services version.
 
     Returns:
-        version (str): dbgServices version.
+        version (str): DbgServices version.
 
     Examples:
         >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
@@ -48,7 +48,7 @@ class DbgLogger:
     Offline Debug Services Logger
 
     Args:
-        verbose (bool): whether to print logs.
+        verbose (bool): Whether to print logs.
 
     Examples:
         >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
@@ -70,8 +70,8 @@ class DbgServices:
     Offline Debug Services class.
 
     Args:
-        dump_file_path (str): directory where the dump files are saved.
-        verbose (bool): whether to print logs (default: False)..
+        dump_file_path (str): Directory where the dump files are saved.
+        verbose (bool): Whether to print logs. Default: False.
 
     Examples:
         >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
@@ -200,7 +200,7 @@ class DbgServices:
         Removing watchpoint from Debug Service instance.
 
         Args:
-            watchpoint_id (int): Watchpoint id
+            watchpoint_id (int): Watchpoint id.
 
         Returns:
             Debug Service instance with removed watchpoint.
@@ -733,17 +733,17 @@ class TensorStatData:
         data_size (int): Size of data in bytes.
         dtype (int): An encoding representing the type of TensorData.
         shape (list): Shape of tensor.
-        is_bool (bool): Whether the data type is bool
-        max_value (float): Maximum value in tensor's elements
-        min_value (float): Minimum value in tensor's elements
-        avg_value (float): Average value of all tensor's elements
-        count (int): Number of elements in tensor
-        neg_zero_count (int): Number of negative elements in tensor
-        pos_zero_count (int): Number of positive elements in tensor
-        nan_cout (int): Number of nan elements in tensor
-        neg_inf_count (int): Number of negative infinity elements in tensor
-        pos_inf_count (int): Number of positive infinity elements in tensor
-        zero_count (int): Total number of zero elements in tensor
+        is_bool (bool): Whether the data type is bool.
+        max_value (float): Maximum value in tensor's elements.
+        min_value (float): Minimum value in tensor's elements.
+        avg_value (float): Average value of all tensor's elements.
+        count (int): Number of elements in tensor.
+        neg_zero_count (int): Number of negative elements in tensor.
+        pos_zero_count (int): Number of positive elements in tensor.
+        nan_cout (int): Number of nan elements in tensor.
+        neg_inf_count (int): Number of negative infinity elements in tensor.
+        pos_inf_count (int): Number of positive infinity elements in tensor.
+        zero_count (int): Total number of zero elements in tensor.
 
     Examples:
         >>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services