!24515 Sync code self check to master

Merge pull request !24515 from TinaMengtingZhang/code_self_check_sep_master
i-robot 2021-10-05 22:41:50 +00:00 committed by Gitee
commit db4669f3d1
12 changed files with 150 additions and 102 deletions

View File

@@ -294,6 +294,7 @@ void Debugger::PreExecuteGraphDebugger(const std::vector<KernelGraphPtr> &graphs
DumpSetup(graph);
}
}
void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
MS_EXCEPTION_IF_NULL(graph_ptr);
// access lock for public method
@@ -313,23 +314,7 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
if (graph_proto_list_.size() > 1) {
// there is more than one graph that is not a dataset graph
if (not_dataset_graph_sum_ > 0) {
// only try to enable debugger if they are not all dataset graphs
if (!debugger_enabled_) {
EnableDebugger();
}
if (debugger_enabled_) {
// only send compiled graphs once at the initial step.
auto dbg_graph_ptr = graph_ptr_;
// use current graph ptr to load parameters
graph_ptr_ = graph_ptr;
LoadParametersAndConst();
// revert graph ptr to original value
graph_ptr_ = dbg_graph_ptr;
SendMultiGraphsAndSuspend(graph_proto_list_);
graph_proto_list_.clear();
}
SendMultiGraphsAndClear(graph_ptr);
}
} else if (graph_proto_list_.size() == 1) {
// single graph, and not the initial step
@@ -359,6 +344,27 @@ void Debugger::PreExecute(const KernelGraphPtr &graph_ptr) {
// reset the flag for the new graph
suspended_at_last_kernel_ = false;
}
void Debugger::SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr) {
// only try to enable debugger if they are not all dataset graphs
if (!debugger_enabled_) {
EnableDebugger();
}
if (debugger_enabled_) {
// only send compiled graphs once at the initial step.
auto dbg_graph_ptr = graph_ptr_;
// use current graph ptr to load parameters
graph_ptr_ = graph_ptr;
LoadParametersAndConst();
// revert graph ptr to original value
graph_ptr_ = dbg_graph_ptr;
SendMultiGraphsAndSuspend(graph_proto_list_);
graph_proto_list_.clear();
}
}
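SendMultiGraphsAndClear temporarily repoints graph_ptr_ at the incoming graph, loads parameters and constants, then restores the original pointer. A minimal sketch (not MindSpore code; names are illustrative) of the same save/swap/restore idea written as an exception-safe scope guard:
#include <utility>
template <typename T>
class ScopedSwap {
 public:
  // Save the current value of `slot`, install `temp`, and restore on scope exit.
  ScopedSwap(T &slot, T temp) : slot_(slot), saved_(std::move(slot)) { slot_ = std::move(temp); }
  ScopedSwap(const ScopedSwap &) = delete;
  ScopedSwap &operator=(const ScopedSwap &) = delete;
  ~ScopedSwap() { slot_ = std::move(saved_); }
 private:
  T &slot_;
  T saved_;
};
// Hypothetical usage mirroring the body above:
//   ScopedSwap<KernelGraphPtr> guard(graph_ptr_, graph_ptr);
//   LoadParametersAndConst();  // graph_ptr_ reverts automatically at scope exit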
bool Debugger::DumpDataEnabledIteration() const {
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (!dump_json_parser.e2e_dump_enabled()) {
@@ -382,6 +388,7 @@ uint32_t Debugger::GetRankID() {
uint32_t rank_id = device_context->GetRankID();
return rank_id;
}
void Debugger::Dump(const KernelGraphPtr &kernel_graph) const {
uint32_t rank_id = GetRankID();
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
@@ -406,6 +413,7 @@ void Debugger::DumpSetup(const KernelGraphPtr &kernel_graph) const {
E2eDump::DumpSetup(kernel_graph.get(), rank_id);
MS_LOG(INFO) << "Finish!";
}
void Debugger::DumpInGraphCompiler(const KernelGraphPtr &kernel_graph) {
// This function will be called for new GPU runtime using MindRTBackend
auto &json_parser = DumpJsonParser::GetInstance();
@@ -491,6 +499,7 @@ bool Debugger::ReadNodeDataRequired(const CNodePtr &kernel) const {
}
return false;
}
void Debugger::PostExecuteNode(const CNodePtr &kernel, bool last_kernel) {
// access lock for public method
std::lock_guard<std::mutex> a_lock(access_lock_);
@@ -1020,7 +1029,7 @@ std::list<TensorProto> Debugger::LoadTensors(const ProtoVector<TensorProto> &ten
// add tensor to result list and increment result_index to check next item in ret_name
tensor_list.push_back(tensor_item);
if (size_iter > INT_MAX - g_chunk_size) {
MS_EXCEPTION(ValueError) << size_iter << " + " << g_chunk_size << " would lead to integer overflow";
MS_EXCEPTION(ValueError) << size_iter << " + " << g_chunk_size << " would lead to integer overflow!";
}
size_iter += g_chunk_size;
}
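The guard above rejects the addition before it can overflow: rewriting size_iter + g_chunk_size > INT_MAX as size_iter > INT_MAX - g_chunk_size keeps the comparison itself overflow-free. A minimal self-contained sketch of the pattern (assuming non-negative operands):
#include <climits>
#include <stdexcept>
// Return acc + step, throwing instead of overflowing a signed int.
int CheckedAdd(int acc, int step) {
  if (step > 0 && acc > INT_MAX - step) {
    throw std::overflow_error("integer overflow");
  }
  return acc + step;
}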
@@ -1434,6 +1443,7 @@ void Debugger::UpdateStepNum(const session::KernelGraph *graph) {
++num_step_;
}
}
void Debugger::UpdateStepNumGPU() {
// UpdateStepNum with DebugActor::DebugOnStepEnd
if (device_target_ == kGPUDevice && (debugger_enabled_ || DumpDataEnabledIteration())) {
@@ -1452,6 +1462,7 @@ void Debugger::ClearCurrentData() {
}
}
}
bool Debugger::TensorExistsInCurrent(const std::string &tensor_name) {
return debug_services_->TensorExistsInCurrent(tensor_name);
}

View File

@@ -189,6 +189,9 @@ class Debugger : public std::enable_shared_from_this<Debugger> {
void SendMultiGraphsAndSuspend(const std::list<GraphProto> &graph_proto_list);
// send multi_graphs and clear the graph_proto_list_
void SendMultiGraphsAndClear(const KernelGraphPtr &graph_ptr);
// wait for command and process command
// send command request and process reply in a loop
// break if RunCMD

View File

@@ -35,9 +35,9 @@ using AnfAlgo = mindspore::session::AnfRuntimeAlgorithm;
namespace mindspore {
static const size_t PARAMETER_OUTPUT_INDEX = 0;
std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size) {
// vector holding the indices of the real outputs
std::vector<int> real_outputs;
std::vector<size_t> real_outputs;
// P.BatchNorm is used for training and inference
// more operators can be added to the filter list here
if (node_name == "BatchNorm") {
@@ -46,8 +46,7 @@ std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &out
} else {
// by default, TensorLoader will load all outputs
for (size_t j = 0; j < output_size; ++j) {
size_t index = j;
real_outputs.push_back(index);
real_outputs.push_back(j);
}
}
return real_outputs;
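Changing the element type from int to size_t matches the unsigned output_size and the indices consumed downstream, removing the (size_t)j cast and the intermediate index variable. A small standalone sketch of the resulting loop shape (illustrative only):
#include <cstddef>
#include <vector>
// With size_t throughout, the loop bound, the stored indices, and downstream
// consumers share one unsigned type, so no casts (and no -Wsign-compare
// warnings) are needed.
std::vector<size_t> AllIndices(size_t output_size) {
  std::vector<size_t> real_outputs;
  real_outputs.reserve(output_size);
  for (size_t j = 0; j < output_size; ++j) {
    real_outputs.push_back(j);
  }
  return real_outputs;
}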
@@ -86,11 +85,11 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, ui
auto output_size = AnfAlgo::GetOutputTensorNum(cnode);
auto node_name = AnfAlgo::GetCNodeName(cnode);
std::string kernel_name = GetKernelNodeName(cnode);
std::vector<int> real_outputs = CheckRealOutput(node_name, output_size);
std::vector<size_t> real_outputs = CheckRealOutput(node_name, output_size);
for (int j : real_outputs) {
for (size_t j : real_outputs) {
auto addr = kernel_outputs[j];
auto type = AnfAlgo::GetOutputInferDataType(cnode, (size_t)j);
auto type = AnfAlgo::GetOutputInferDataType(cnode, j);
// For example, this happens with the Depend op
if (type == kMetaTypeNone) {
continue;

View File

@@ -24,7 +24,7 @@ using mindspore::kernel::KernelLaunchInfo;
namespace mindspore {
std::vector<int> CheckRealOutput(const std::string &node_name, const size_t &output_size);
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size);
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info_, uint32_t exec_order_);

View File

@@ -48,7 +48,6 @@ EventReply GrpcClient::WaitForCommand(const Metadata &metadata) {
EventReply reply;
grpc::ClientContext context;
grpc::Status status = stub_->WaitCMD(&context, metadata, &reply);
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: WaitForCommand";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -61,7 +60,6 @@ EventReply GrpcClient::SendMetadata(const Metadata &metadata) {
EventReply reply;
grpc::ClientContext context;
grpc::Status status = stub_->SendMetadata(&context, metadata, &reply);
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendMetadata";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -114,7 +112,6 @@ EventReply GrpcClient::SendGraph(const GraphProto &graph) {
}
writer->WritesDone();
grpc::Status status = writer->Finish();
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendGraph";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -136,7 +133,6 @@ EventReply GrpcClient::SendMultiGraphs(const std::list<Chunk> &chunks) {
}
writer->WritesDone();
grpc::Status status = writer->Finish();
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendMultigraphs";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -158,7 +154,6 @@ EventReply GrpcClient::SendTensors(const std::list<TensorProto> &tensors) {
}
writer->WritesDone();
grpc::Status status = writer->Finish();
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendTensors";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
@@ -180,7 +175,6 @@ EventReply GrpcClient::SendWatchpointHits(const std::list<WatchpointHit> &watchp
}
writer->WritesDone();
grpc::Status status = writer->Finish();
if (!status.ok()) {
MS_LOG(ERROR) << "RPC failed: SendWatchpointHits";
MS_LOG(ERROR) << status.error_code() << ": " << status.error_message();
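Every streaming RPC in this file ends with the same epilogue: half-close the stream with WritesDone(), collect the server's final grpc::Status with Finish(), and log on failure. A schematic sketch of that shared epilogue (the helper, MessageT, and the plain-cerr logging are placeholders, not the project's code):
#include <grpcpp/grpcpp.h>
#include <iostream>
// Half-close the client stream, wait for the server's status, report errors.
template <typename MessageT>
bool FinishStream(grpc::ClientWriter<MessageT> *writer, const char *rpc_name) {
  writer->WritesDone();                    // no more client messages
  grpc::Status status = writer->Finish();  // blocks for the final status
  if (!status.ok()) {
    std::cerr << "RPC failed: " << rpc_name << " " << status.error_code() << ": "
              << status.error_message() << std::endl;
    return false;
  }
  return true;
}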

View File

@@ -18,25 +18,24 @@
#include <algorithm>
#include <chrono>
DbgServices::DbgServices(bool verbose) { debug_services_ = new DebugServices(); }
DbgServices::DbgServices(bool verbose) { debug_services_ = std::make_shared<DebugServices>(); }
DbgServices::DbgServices(const DbgServices &other) {
MS_LOG(INFO) << "cpp DbgServices object is created via copy";
debug_services_ = new DebugServices(*other.debug_services_);
debug_services_ = other.debug_services_;
}
DbgServices &DbgServices::operator=(const DbgServices &other) {
MS_LOG(INFO) << "cpp DbgServices object is being assigned a different state";
if (this != &other) {
delete debug_services_;
debug_services_ = new DebugServices(*other.debug_services_);
debug_services_ = other.debug_services_;
}
return *this;
}
DbgServices::~DbgServices() noexcept {
MS_LOG(INFO) << "cpp DbgServices object is deleted";
delete debug_services_;
debug_services_ = nullptr;
}
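Moving debug_services_ from a raw pointer to std::shared_ptr changes the copy constructor and assignment from deep copies to shared ownership, and makes the manual delete in the destructor unnecessary. A self-contained sketch of the new semantics (illustrative types only):
#include <cassert>
#include <memory>
struct Services { int state = 0; };
int main() {
  auto a = std::make_shared<Services>();
  auto b = a;                  // copies now share one instance
  b->state = 42;
  assert(a->state == 42);      // both handles observe the same state
  assert(a.use_count() == 2);
  return 0;                    // last owner releases the object; no delete needed
}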
std::string DbgServices::GetVersion() const {
@@ -70,25 +69,26 @@ int32_t DbgServices::AddWatchpoint(
unsigned int id, unsigned int watch_condition,
std::map<std::string, std::map<std::string, std::variant<bool, std::vector<std::string>>>> check_nodes,
std::vector<parameter_t> parameter_list) {
MS_LOG(INFO) << "cpp start";
MS_EXCEPTION_IF_NULL(debug_services_);
MS_LOG(INFO) << "cpp DbgServices start AddWatchpoint";
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint id " << id;
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint watch_condition " << watch_condition;
for (auto const &node : check_nodes) {
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint name " << node.first;
MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint name " << node.first;
auto attr_map = node.second;
bool is_output = std::get<bool>(attr_map["is_output"]);
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint is_output " << is_output;
MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint is_output " << is_output;
std::vector<std::string> rank_id_str = std::get<std::vector<std::string>>(attr_map["rank_id"]);
std::vector<std::uint32_t> rank_id;
(void)std::transform(
rank_id_str.begin(), rank_id_str.end(), std::back_inserter(rank_id),
[](std::string &id_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(id_str)); });
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint rank_id ";
MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint rank_id: ";
for (auto const &i : rank_id) {
MS_LOG(INFO) << i << " ";
MS_LOG(DEBUG) << i << " ";
}
// std::vector<uint32_t> root_graph_id = std::get<std::vector<uint32_t>>(attr_map["root_graph_id"]);
@@ -97,9 +97,9 @@ int32_t DbgServices::AddWatchpoint(
(void)std::transform(
root_graph_id_str.begin(), root_graph_id_str.end(), std::back_inserter(root_graph_id),
[](std::string &graph_str) -> std::uint32_t { return static_cast<uint32_t>(std::stoul(graph_str)); });
MS_LOG(INFO) << "cpp DbgServices AddWatchpoint root_graph_id";
MS_LOG(DEBUG) << "cpp DbgServices AddWatchpoint root_graph_id: ";
for (auto const &j : root_graph_id) {
MS_LOG(INFO) << j << " ";
MS_LOG(DEBUG) << j << " ";
}
}
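Both ID lists above are converted the same way: std::transform with a std::stoul lambda turns each decimal string into a uint32_t (std::stoul throws std::invalid_argument on malformed input). A minimal standalone sketch of the pattern:
#include <algorithm>
#include <cstdint>
#include <iterator>
#include <string>
#include <vector>
// Convert decimal strings such as {"0", "1"} into numeric IDs.
std::vector<std::uint32_t> ToIds(const std::vector<std::string> &id_strs) {
  std::vector<std::uint32_t> ids;
  ids.reserve(id_strs.size());
  (void)std::transform(id_strs.begin(), id_strs.end(), std::back_inserter(ids),
                       [](const std::string &s) { return static_cast<std::uint32_t>(std::stoul(s)); });
  return ids;
}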
@@ -154,17 +154,19 @@ int32_t DbgServices::AddWatchpoint(
debug_services_->AddWatchpoint(id, watch_condition, 0, check_node_list, parameter_list_backend,
&check_node_device_list, &check_node_graph_list);
MS_LOG(INFO) << "cpp end";
MS_LOG(INFO) << "cpp DbgServices end AddWatchpoint";
return 0;
}
int32_t DbgServices::RemoveWatchpoint(unsigned int id) {
MS_EXCEPTION_IF_NULL(debug_services_);
MS_LOG(INFO) << "cpp DbgServices RemoveWatchpoint id " << id;
debug_services_->RemoveWatchpoint(id);
return 0;
}
std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iteration) {
MS_EXCEPTION_IF_NULL(debug_services_);
MS_LOG(INFO) << "cpp DbgServices CheckWatchpoint iteration " << iteration;
std::vector<std::string> name;
@@ -197,19 +199,19 @@ std::vector<watchpoint_hit_t> DbgServices::CheckWatchpoints(unsigned int iterati
watchpoint_hit_t hit(name[i], std::stoi(slot[i]), condition[i], watchpoint_id[i], api_parameter_vector,
error_codes[i], rank_id[i], root_graph_id[i]);
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t rank_id " << hit.rank_id;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t name " << hit.name;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t slot " << hit.slot;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t watchpoint_id " << hit.watchpoint_id;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t error_code " << hit.error_code;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t rank_id " << hit.rank_id;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t root_graph_id " << hit.root_graph_id;
for (auto const &parameter_i : api_parameter_vector) {
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit;
MS_LOG(INFO) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter name " << parameter_i.name;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter disabled " << parameter_i.disabled;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter value " << parameter_i.value;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter hit " << parameter_i.hit;
MS_LOG(DEBUG) << "cpp DbgServices watchpoint_hit_t parameter actual_value " << parameter_i.actual_value;
}
hits.push_back(hit);
@@ -230,6 +232,7 @@ unsigned int GetTensorSlot(const tensor_info_t info) { return info.slot; }
bool GetTensorIsOutput(const tensor_info_t info) { return info.is_output; }
std::vector<std::shared_ptr<TensorData>> DbgServices::ReadTensorsUtil(std::vector<tensor_info_t> info) {
MS_EXCEPTION_IF_NULL(debug_services_);
for (auto i : info) {
MS_LOG(INFO) << "cpp DbgServices ReadTensor info name " << i.node_name << ", slot " << i.slot << ", iteration "
<< i.iteration << ", rank_id " << i.rank_id << ", root_graph_id " << i.root_graph_id << ", is_output "
@@ -284,6 +287,7 @@ std::vector<tensor_data_t> DbgServices::ReadTensors(const std::vector<tensor_inf
std::vector<std::shared_ptr<TensorData>> result_list;
result_list = ReadTensorsUtil(info);
for (auto result : result_list) {
MS_EXCEPTION_IF_NULL(result);
tensor_data_t tensor_data_item(result->GetDataPtr(), result->GetByteSize(), result->GetType(), result->GetShape());
tensors_read.push_back(tensor_data_item);
}

View File

@@ -103,7 +103,7 @@ struct tensor_info_t {
struct tensor_data_t {
tensor_data_t(char *data_ptr, uint64_t data_size, int dtype, const std::vector<int64_t> &shape)
: data_size(data_size), dtype(dtype), shape(shape) {
if (data_ptr != NULL) {
if (data_ptr != nullptr) {
this->data_ptr = py::bytes(data_ptr, data_size);
} else {
this->data_ptr = py::bytes();
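The NULL to nullptr change is the idiomatic C++11 spelling; the branch itself guards against constructing py::bytes from a null buffer. A sketch of the guarded construction (assumes pybind11 with an active interpreter; py::bytes(const char *, size_t) copies the buffer):
#include <cstdint>
#include <pybind11/pybind11.h>
namespace py = pybind11;
py::bytes MakeBytes(const char *data_ptr, std::uint64_t data_size) {
  if (data_ptr != nullptr) {
    return py::bytes(data_ptr, data_size);  // copy data_size raw bytes
  }
  return py::bytes();  // empty bytes object when no data is available
}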
@@ -182,9 +182,6 @@ struct TensorStatData {
};
class DbgServices {
private:
DebugServices *debug_services_;
public:
explicit DbgServices(bool verbose = false);
@@ -215,6 +212,9 @@ class DbgServices {
std::vector<TensorStatData> ReadTensorsStat(const std::vector<tensor_info_t> info);
std::string GetVersion() const;
private:
std::shared_ptr<DebugServices> debug_services_ = nullptr;
};
#endif // DEBUG_DBG_SERVICES_H_

View File

@@ -328,10 +328,10 @@ void TensorSummary<T>::InitCalculators(const std::vector<DebugServices::watchpoi
range_counts_[wp_id]->set_range_end_inclusive(wp.parameter_list[1].value);
}
} else if (wp.tensor_update_ratio_mean_enabled() && prev_tensor_ptr_) {
(void)means_.insert({"curr_prev_diff_mean", std::make_unique<MeanCalculator>()});
(void)means_.insert({"abs_prev_mean", std::make_unique<MeanCalculator>()});
(void)means_.emplace("curr_prev_diff_mean", std::make_unique<MeanCalculator>());
(void)means_.emplace("abs_prev_mean", std::make_unique<MeanCalculator>());
} else if (wp.abs_mean_enabled()) {
(void)means_.insert({"abs_current_mean", std::make_unique<MeanCalculator>()});
(void)means_.emplace("abs_current_mean", std::make_unique<MeanCalculator>());
}
}
}
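insert({key, value}) must first materialize a std::pair temporary, which is awkward with a move-only mapped type like std::unique_ptr; emplace forwards its arguments and constructs the element in place. Either way the map is left unchanged if the key already exists. A standalone sketch (C++17 try_emplace would additionally guarantee the argument is not consumed on a failed insert):
#include <map>
#include <memory>
#include <string>
int main() {
  std::map<std::string, std::unique_ptr<int>> means;
  (void)means.insert({"abs_current_mean", std::make_unique<int>(1)});    // pair temporary, then move
  (void)means.emplace("curr_prev_diff_mean", std::make_unique<int>(2));  // constructed in place
  (void)means.emplace("curr_prev_diff_mean", std::make_unique<int>(3));  // key exists: map unchanged
  return 0;
}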

View File

@@ -150,6 +150,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
return ret;
}
auto tensor_data = std::make_shared<mindspore::TensorData>();
MS_EXCEPTION_IF_NULL(tensor_data);
tensor_data->SetName(tensor_name);
tensor_data->SetExecutionOrder(execution_order);
tensor_data->SetSlot(slot);

View File

@@ -57,7 +57,7 @@ void DebugActor::Debug(const AnfNodePtr &node, const KernelLaunchInfo *launch_in
} else if (device_context->GetDeviceAddressType() == device::DeviceAddressType::kGPU) {
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger) {
if (debugger != nullptr) {
std::string kernel_name = cnode->fullname_with_scope();
debugger->SetCurNode(kernel_name);
bool read_data = CheckReadData(cnode);
@@ -111,7 +111,7 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
#ifdef ENABLE_DEBUGGER
auto debugger = Debugger::GetInstance();
if (debugger) {
if (debugger != nullptr) {
debugger->Debugger::UpdateStepNumGPU();
// Reset exec_order for the next step
exec_order_ = 0;

View File

@@ -28,7 +28,9 @@ import numpy as np
class ConvertToolLoader:
"""Module to load CANN conversion tool."""
"""
Module to load CANN conversion tool.
"""
def __init__(self):
self.utils = None
@@ -44,7 +46,9 @@
@staticmethod
def find_toolkit_path():
"""Find the path to Ascend toolkit."""
"""
Find the path to Ascend toolkit.
"""
ascend_toolkit_path = os.getenv("ASCEND_TOOLKIT_PATH")
if not ascend_toolkit_path:
ascend_toolkit_path = "/usr/local/Ascend"
@@ -63,7 +67,9 @@
return msaccucmp_file_list[0].parent
def load_convert_tool(self):
"""load CANN conversion tool from the toolkit path."""
"""
Load CANN conversion tool from the toolkit path.
"""
# add toolkit path to system searching module path
if str(self.toolkit_path) not in sys.path:
sys.path.insert(0, str(self.toolkit_path))
@@ -99,13 +105,17 @@
self.compare_exception = self.utils.CompareError
def reset_system_path(self):
# restore system searching module path
"""
Restore the system module search path.
"""
if str(self.toolkit_path) in sys.path:
sys.path.remove(str(self.toolkit_path))
def parse_args(file_list, output_path):
"""Helper function to parse the input argument for the conversion configuration."""
"""
Helper function to parse the input argument for the conversion configuration.
"""
args_dict = dict()
args_dict['dump_version'] = '2.0'
args_dict['format'] = 'NCHW'
@@ -122,7 +132,9 @@ def parse_args(file_list, output_path):
class AsyncDumpConverter:
"""Convert the target async dump data into npy files."""
"""
Convert the target async dump data into npy files.
"""
def __init__(self, file_list, output_path):
# check input path
@@ -138,12 +150,16 @@
self.clear_failed_list_file()
def clear_failed_list_file(self):
"""Remove existing failed txt file."""
"""
Remove existing failed txt file.
"""
if self.failed_file_path and os.path.exists(self.failed_file_path):
os.remove(self.failed_file_path)
def convert_files(self):
"""Main entry of the converter to convert async dump files into npy format."""
"""
Main entry of the converter to convert async dump files into npy format.
"""
self.convert_tool.log.print_info_log('Start to convert async dump files.')
try:
if self.args.format is not None:
@@ -164,7 +180,9 @@
self.convert_tool.log.print_info_log('Finish to convert async dump files.')
def convert_failed_tensors(self):
"""Convert the failed tensor recorded in the failed txt file."""
"""
Convert the failed tensors recorded in the failed txt file.
"""
self.convert_tool.log.print_info_log(
'Start to convert failed tensors recorded in ' + self.failed_file_path + '.')
with open(self.failed_file_path) as failed_lines:
@@ -177,7 +195,9 @@
'Failed to convert ' + failed_line + ' to Host format: ' + str(err))
def convert_one_failed_tensor(self, failed_tensor):
"""Convert failed operator one by one."""
"""
Convert a single failed tensor.
"""
if len(failed_tensor) <= 1:
raise ValueError(
"Invalid tensor info in convert_failed_file_list.txt")
@@ -191,11 +211,13 @@
tensor = getattr(op_data, tensor_type)[index]
dump_data_array = self.convert_tool.utils.deserialize_dump_data_to_array(tensor)
array = dump_data_array.reshape(tensor.shape.dim)
self._save_tensor_to_npy_file(
file_path, tensor_type, index, tensor.format, array)
out_path = self._generate_path(file_path, tensor_type, index, tensor.format)
self._save_tensor_to_npy_file(out_path, array)
def handle_multi_process(self, convert_obj, files):
"""Convert async format files to npy in a multithreaded manner."""
"""
Convert async format files to npy in a multithreaded manner.
"""
return_code = self.convert_tool.compare_none_error
# look for the function compatible with the installed toolkit package version
progress = self.convert_tool.progress(len(files))
@@ -223,7 +245,9 @@
return return_code
def _get_file_list(self, files, convert_obj):
"""Process to get file lists in multi_process."""
"""
Collect the file lists used by multi-process conversion.
"""
multi_process_file_list = []
big_file_list = []
max_file_size = 0
@@ -241,7 +265,9 @@
return multi_process_file_list, big_file_list
def _process_big_file(self, big_file_list, convert_obj):
"""Process big file in multi_process."""
"""
Process big files in multi-process conversion.
"""
return_code = self.convert_tool.compare_none_error
for big_file in big_file_list:
if hasattr(convert_obj, '_convert_format_for_one_file'):
@@ -256,8 +282,18 @@
return_code = ret_bf
return return_code
def _save_tensor_to_npy_file(self, file_path, tensor_type, idx, tensor_format, dump_data_array):
"""Save tensor file into npy format."""
@staticmethod
def _save_tensor_to_npy_file(out_path, dump_data_array):
"""
Save tensor file into npy format.
"""
np.save(out_path, dump_data_array)
os.chmod(out_path, stat.S_IRUSR)
def _generate_path(self, file_path, tensor_type, idx, tensor_format):
"""
Generate the path and filename for the target npy file.
"""
file_name = os.path.basename(file_path)
name_splits = file_name.split('.')
name_splits[1] = name_splits[1].split('_')[-1]
@@ -268,12 +304,12 @@
idx,
self.convert_tool.common.get_format_string(tensor_format)
)
out_path = os.path.join(self.output_path, out_file_name)
np.save(out_path, dump_data_array)
os.chmod(out_path, stat.S_IRUSR)
return os.path.join(self.output_path, out_file_name)
def _rename_generated_npy_files(self):
"""In order to follow dump naming convention, rename npy files generated by CANN conversion tool."""
"""
Rename the npy files generated by the CANN conversion tool to follow the dump naming convention.
"""
target_file_list = []
for in_file in self.files_to_convert:
target_file_list.extend(glob.glob(in_file + "*.npy"))

View File

@@ -31,7 +31,7 @@ def get_version():
Function to return offline Debug Services version.
Returns:
version (str): dbgServices version.
version (str): DbgServices version.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
@@ -48,7 +48,7 @@ class DbgLogger:
Offline Debug Services Logger
Args:
verbose (bool): whether to print logs.
verbose (bool): Whether to print logs.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
@@ -70,8 +70,8 @@ class DbgServices:
Offline Debug Services class.
Args:
dump_file_path (str): directory where the dump files are saved.
verbose (bool): whether to print logs (default: False)..
dump_file_path (str): Directory where the dump files are saved.
verbose (bool): Whether to print logs. Default: False.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services
@@ -200,7 +200,7 @@
Removing watchpoint from Debug Service instance.
Args:
watchpoint_id (int): Watchpoint id
watchpoint_id (int): Watchpoint id.
Returns:
Debug Service instance with removed watchpoint.
@@ -733,17 +733,17 @@ class TensorStatData:
data_size (int): Size of data in bytes.
dtype (int): An encoding representing the type of TensorData.
shape (list): Shape of tensor.
is_bool (bool): Whether the data type is bool
max_value (float): Maximum value in tensor's elements
min_value (float): Minimum value in tensor's elements
avg_value (float): Average value of all tensor's elements
count (int): Number of elements in tensor
neg_zero_count (int): Number of negative elements in tensor
pos_zero_count (int): Number of positive elements in tensor
nan_cout (int): Number of nan elements in tensor
neg_inf_count (int): Number of negative infinity elements in tensor
pos_inf_count (int): Number of positive infinity elements in tensor
zero_count (int): Total number of zero elements in tensor
is_bool (bool): Whether the data type is bool.
max_value (float): Maximum value in tensor's elements.
min_value (float): Minimum value in tensor's elements.
avg_value (float): Average value of all tensor's elements.
count (int): Number of elements in tensor.
neg_zero_count (int): Number of negative elements in tensor.
pos_zero_count (int): Number of positive elements in tensor.
nan_cout (int): Number of nan elements in tensor.
neg_inf_count (int): Number of negative infinity elements in tensor.
pos_inf_count (int): Number of positive infinity elements in tensor.
zero_count (int): Total number of zero elements in tensor.
Examples:
>>> from mindspore.ccsrc.debug.debugger.offline_debug import dbg_services