forked from mindspore-Ecosystem/mindspore
!48325 [Kernel Dump] Delete the dump files of operators that did not overflow, in kernel-by-kernel dump mode
Merge pull request !48325 from maoyaomin/mym_debugger_kernel_dumper
commit 6f24f09301
@@ -14,6 +14,7 @@
 * limitations under the License.
 */
#include "debug/data_dump/dump_utils.h"
#include <dirent.h>
#include <map>
#include <vector>
#include <algorithm>
@@ -174,4 +175,81 @@ void DumpToFile(const std::string &file_name, const std::string &dump_str) {
  file.close();
  ChangeFileMode(real_path_str, S_IRUSR);
}

// Remove the per-iteration operator dump directory once it no longer contains any entry
// other than "." and "..".
void RemoveEmptyDir(const std::string &dir_path) {
  uint32_t dir_count = 0;
  DIR *d = opendir(dir_path.c_str());
  if (d == nullptr) {
    // Nothing to do if the directory cannot be opened (already removed or never created).
    return;
  }
  struct dirent *dir = nullptr;
  while ((dir = readdir(d)) != nullptr) {
    std::string name = dir->d_name;
    if (name == "." || name == "..") {
      continue;
    } else {
      dir_count++;
    }
  }
  (void)closedir(d);
  if (dir_count == 0) {
    auto ret = remove(dir_path.c_str());
    if (ret == 0) {
      MS_LOG(INFO) << "Delete empty dir successfully, dir path is:" << dir_path;
    }
  }
}

// Record the task/stream keys of the operators that overflowed in this iteration. The keys are
// taken from the Opdebug.Node_OpDebug.* records written by the Ascend overflow detection.
void SaveOverflowOperator(const std::string &iterator, const std::string &dump_rank_path) {
  const std::string overflow_dump_dir = "debug_files";
  const std::string overflow_file_prefix = "Opdebug.Node_OpDebug.";
  const std::string cur_step_overflow_path = dump_rank_path + "/" + overflow_dump_dir + "/" + iterator;
  DIR *d = opendir(cur_step_overflow_path.c_str());
  overflowOperators.clear();
  if (d == nullptr) {
    MS_LOG(WARNING) << "Overflow file directory does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      std::string filename = dir->d_name;
      if (filename.find(overflow_file_prefix) != std::string::npos) {
        // Keep the dot-bounded id portion between the prefix and the last '.', so the later
        // substring match against operator dump file names cannot hit a partial id.
        uint32_t pos_start = overflow_file_prefix.size();
        uint32_t n = filename.rfind(".") - pos_start + 2;
        std::string stream_task_name = filename.substr(pos_start - 1, n);
        overflowOperators.emplace_back(stream_task_name);
      }
    }
    (void)closedir(d);
  }
}

// After a kernel-by-kernel (op debug) step, delete every operator dump file of the current
// iteration whose task/stream key is not among the recorded overflow operators, then drop the
// iteration directory if it ended up empty.
void DeleteNoOverflowFile(uint32_t rank_id, uint32_t graph_id) {
  auto &json_parser = DumpJsonParser::GetInstance();
  if (!(json_parser.async_dump_enabled() || json_parser.e2e_dump_enabled())) {
    return;
  }
  std::string cur_dump_path = json_parser.path() + "/rank_" + std::to_string(rank_id);
  std::string net_name_ = json_parser.net_name();
  std::string iterator = std::to_string(json_parser.cur_dump_iter());
  SaveOverflowOperator(iterator, cur_dump_path);
  std::string overflow_operator_dump_path =
    cur_dump_path + "/" + net_name_ + "/" + std::to_string(graph_id) + "/" + iterator;
  DIR *d = opendir(overflow_operator_dump_path.c_str());
  if (d == nullptr) {
    MS_LOG(WARNING) << "Overflow iterator file directory does not exist!";
  } else {
    struct dirent *dir = nullptr;
    while ((dir = readdir(d)) != nullptr) {
      std::string filename = dir->d_name;
      bool is_exist =
        std::any_of(std::begin(overflowOperators), std::end(overflowOperators),
                    [&](const std::string &stream_task_str) { return filename.find(stream_task_str) != std::string::npos; });
      if (!is_exist) {
        auto ret = remove((overflow_operator_dump_path + "/" + filename).c_str());
        if (ret == 0) {
          MS_LOG(INFO) << "Delete file successfully, filename is:" << filename;
        }
      }
    }
    (void)closedir(d);
    RemoveEmptyDir(overflow_operator_dump_path);
  }
}
}  // namespace mindspore
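The filtering in the new helpers hinges on a substring key taken from the overflow record file names. Below is a minimal standalone sketch of that matching; it is not part of the patch, and the file-name layout Opdebug.Node_OpDebug.<task id>.<stream id>.<timestamp> as well as every concrete name and id are assumptions made for illustration.

// Illustration only: mirrors the key extraction in SaveOverflowOperator and the
// std::any_of match in DeleteNoOverflowFile. File names and ids are made up.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

std::string ExtractKey(const std::string &overflow_file) {
  const std::string prefix = "Opdebug.Node_OpDebug.";
  // Keep the dot-bounded id portion between the prefix and the last '.', e.g. ".61.15.".
  size_t pos_start = prefix.size();
  size_t n = overflow_file.rfind('.') - pos_start + 2;
  return overflow_file.substr(pos_start - 1, n);
}

int main() {
  std::vector<std::string> keys = {ExtractKey("Opdebug.Node_OpDebug.61.15.1675143619731620")};
  std::vector<std::string> dump_files = {
      "Conv2D.Default_conv1.61.15.1675143619731800",  // same ids as an overflow record -> kept
      "MatMul.Default_fc1.62.15.1675143619731900"};   // no matching overflow record -> deleted
  for (const auto &f : dump_files) {
    bool keep = std::any_of(keys.begin(), keys.end(),
                            [&](const std::string &k) { return f.find(k) != std::string::npos; });
    std::cout << f << (keep ? "  -> keep" : "  -> delete") << std::endl;
  }
  return 0;
}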
@@ -18,6 +18,7 @@
#define MINDSPORE_MINDSPORE_CCSRC_DEBUG_DATA_DUMP_DUMP_UTILS_H_

#include <map>
#include <vector>
#include <string>
#include <memory>
@@ -30,6 +31,7 @@ using DeviceTensorPtr = std::shared_ptr<DeviceTensor>;
namespace mindspore {
static const size_t PARAMETER_OUTPUT_INDEX = 0;
static const size_t VALUE_NODE_OUTPUT_INDEX = 0;
static std::vector<std::string> overflowOperators;

std::string GenerateDumpPath(uint32_t graph_id, uint32_t rank_id = 0, bool is_cst = false);
@@ -43,6 +45,12 @@ const DeviceTensorPtr GetParameterInfo(const AnfNodePtr &node, NotNull<ShapeVect
void DumpMemToFile(const std::string &file_path, const device::DeviceAddress &addr, const ShapeVector &int_shapes,
                   const TypeId &type, bool trans_flag = false);

void RemoveEmptyDir(const std::string &dir_path);

void SaveOverflowOperator(const std::string &iterator, const std::string &dump_rank_path);

void DeleteNoOverflowFile(uint32_t rank_id, uint32_t graph_id);
BACKEND_EXPORT std::string GetOpNameWithoutScope(const std::string &fullname_with_scope,
                                                 const std::string &separator = "--");
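Piecing together the path concatenation in the new dump_utils.cc code, the three declarations above operate on two directories per iteration: the overflow records under debug_files and the operator dump files under the net/graph directory. The snippet below merely rebuilds those two paths for illustration; the base path, net name, and ids are placeholder values, not taken from the patch.

// Reconstructed layout, illustration only; the base path, ids and net name are placeholders.
//   <dump_path>/rank_<rank_id>/debug_files/<iteration>/Opdebug.Node_OpDebug.*   overflow records
//   <dump_path>/rank_<rank_id>/<net_name>/<graph_id>/<iteration>/               operator dump files
#include <iostream>
#include <string>

int main() {
  std::string dump_path = "/tmp/dump", net_name = "Net";
  unsigned rank_id = 0, graph_id = 1, iteration = 3;
  std::string rank_path = dump_path + "/rank_" + std::to_string(rank_id);
  std::string overflow_dir = rank_path + "/debug_files/" + std::to_string(iteration);
  std::string op_dump_dir =
      rank_path + "/" + net_name + "/" + std::to_string(graph_id) + "/" + std::to_string(iteration);
  std::cout << overflow_dir << "\n" << op_dump_dir << std::endl;
  return 0;
}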
@@ -658,7 +658,8 @@ bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32
void E2eDump::DumpParametersData(uint32_t rank_id, const Debugger *debugger) {
  uint32_t root_graph_id = debugger->GetCurrentRootGraphId();
  auto &dump_json_parser = DumpJsonParser::GetInstance();
-  if (dump_json_parser.async_dump_enabled() && !debugger->GetAscendKernelByKernelFlag()) {
+  if ((dump_json_parser.async_dump_enabled() && !debugger->GetAscendKernelByKernelFlag()) ||
+      (dump_json_parser.async_dump_enabled() && dump_json_parser.op_debug_mode() > 0)) {
    // Dump parameters for mindRT in async dump only for kernel by kernel mode.
    return;
  }
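The widened guard above is just the distributed form of async_dump_enabled && (!kernel_by_kernel || op_debug_mode > 0): parameter dumping is now also skipped when async dump runs with an op-debug (overflow) mode, not only when the Ascend kernel-by-kernel flag is off. A small equivalence check, purely an editorial sanity check and not project code:

// Editorial sanity check, not project code: the patched condition equals the factored form.
#include <cassert>

int main() {
  for (int a = 0; a <= 1; ++a) {      // dump_json_parser.async_dump_enabled()
    for (int k = 0; k <= 1; ++k) {    // debugger->GetAscendKernelByKernelFlag()
      for (int m = 0; m <= 1; ++m) {  // dump_json_parser.op_debug_mode() > 0
        bool patched = (a && !k) || (a && m);
        bool factored = a && (!k || m);
        assert(patched == factored);
      }
    }
  }
  return 0;
}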
@@ -28,6 +28,8 @@ std::map<std::string, std::shared_ptr<OverflowDumper>> &OverflowDumper::GetInsta
  return instance_map;
}

void OverflowDumper::Clear() { GetInstanceMap().clear(); }

std::shared_ptr<OverflowDumper> OverflowDumper::GetInstance(const std::string &name) noexcept {
  if (auto iter = GetInstanceMap().find(name); iter != GetInstanceMap().end()) {
    return iter->second;
@@ -34,6 +34,7 @@ class BACKEND_EXPORT OverflowDumper {
  static std::shared_ptr<OverflowDumper> GetInstance(const std::string &name) noexcept;
  static bool Register(const std::string &name, const std::shared_ptr<OverflowDumper> &instance);
  static void Clear();
  virtual void OpLoadDumpInfo(const CNodePtr &kernel) = 0;
  virtual void Init() = 0;
  virtual void OpDebugRegisterForStream(const CNodePtr &kernel) = 0;
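The new Clear() simply empties the same name-to-instance map that Register/GetInstance use, so registered dumpers are dropped once their streams are unregistered. Below is a minimal sketch of that registry pattern; Dumper stands in for the abstract OverflowDumper, and everything else mirrors the map-based implementation shown above.

// Minimal sketch of the name -> instance registry; "Dumper" is a stand-in for the abstract
// OverflowDumper, the map-based functions mirror the implementation shown above.
#include <iostream>
#include <map>
#include <memory>
#include <string>

struct Dumper {};

std::map<std::string, std::shared_ptr<Dumper>> &GetInstanceMap() {
  static std::map<std::string, std::shared_ptr<Dumper>> instance_map;
  return instance_map;
}

bool Register(const std::string &name, const std::shared_ptr<Dumper> &instance) {
  return GetInstanceMap().emplace(name, instance).second;
}

std::shared_ptr<Dumper> GetInstance(const std::string &name) noexcept {
  if (auto iter = GetInstanceMap().find(name); iter != GetInstanceMap().end()) {
    return iter->second;
  }
  return nullptr;
}

void Clear() { GetInstanceMap().clear(); }

int main() {
  Register("KernelDumper", std::make_shared<Dumper>());
  std::cout << (GetInstance("KernelDumper") != nullptr);               // 1
  Clear();                                                             // registry dropped
  std::cout << (GetInstance("KernelDumper") != nullptr) << std::endl;  // 0
  return 0;
}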
@@ -200,8 +200,10 @@ void AscendKernelRuntime::ClearGraphModelMap() {
  graph_data_dumper_.clear();
  // tell users which dump kernel name not used
  DumpJsonParser::GetInstance().PrintUnusedKernel();
-  KernelDumper kernel_dumper;
-  kernel_dumper.OpDebugUnregisterForStream();
+  if (DumpJsonParser::GetInstance().async_dump_enabled()) {
+    KernelDumper kernel_dumper;
+    kernel_dumper.OpDebugUnregisterForStream();
+  }
#endif

  graph_kernel_events_map_.clear();
@@ -436,18 +436,14 @@ void KernelDumper::OpDebugRegisterForStream(const CNodePtr &kernel) {
}

void KernelDumper::OpDebugUnregisterForStream() {
-  uint32_t op_debug_mode = DumpJsonParser::GetInstance().op_debug_mode();
-  if (op_debug_mode == kNoOverflow) {
-    return;
-  }
  for (auto iter = KernelDumper::op_debug_tasks.begin(); iter != KernelDumper::op_debug_tasks.end(); iter++) {
    rtError_t rt_ret = rtDebugUnRegisterForStream(iter->first);
    if (rt_ret != RT_ERROR_NONE) {
-      MS_LOG(EXCEPTION) << "[KernelDumperr] Call rtDebugUnRegisterForStream failed, ret = " << rt_ret;
+      MS_LOG(EXCEPTION) << "[KernelDumper] Call rtDebugUnRegisterForStream failed, ret = " << rt_ret;
    }
  }
  KernelDumper::op_debug_tasks.clear();
  OverflowDumper::Clear();
}
#endif
}  // namespace ascend
@@ -21,7 +21,6 @@
#include <set>
#include <memory>
#include <string>
#include <vector>
#include <functional>
#include "runtime/dev.h"
#include "runtime/mem.h"
@@ -88,7 +87,7 @@ class KernelDumper : public debug::OverflowDumper {
  bool is_op_debug_;
  uint32_t op_debug_mode_;

-  void *dev_load_mem_;
+  void *dev_load_mem_ = nullptr;
  void *proto_dev_mem_ = nullptr;
  void *proto_size_dev_mem_ = nullptr;
  std::mutex register_mutex_;
@@ -176,31 +176,42 @@ void DebugActor::DebugOnStepBegin(const std::vector<KernelGraphPtr> &graphs,
    }
  }
-  if (DumpJsonParser::GetInstance().async_dump_enabled()) {
-    auto kCurLoopCountName = "current_loop_count";
-    for (size_t i = 0; i < graphs.size(); i++) {
-      const auto &graph_ = graphs[i];
-      if (device_contexts[i]->GetDeviceType() != device::DeviceType::kAscend) {
-        continue;
-      }
-      auto device_loop_control_tensors = graph_->device_loop_control_tensors();
-      if (device_loop_control_tensors.count(kCurLoopCountName) == 0) {
-        MS_LOG(WARNING) << "Can't find Device Loop Control Tensor " << kCurLoopCountName;
-        return;
-      }
-      auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
-      MS_EXCEPTION_IF_NULL(tensor);
-      auto *cur_val = static_cast<int32_t *>(tensor->data_c());
-      MS_EXCEPTION_IF_NULL(cur_val);
-      *cur_val = current_step;
-      tensor->set_sync_status(kNeedSyncHostToDevice);
-      auto device_address = tensor->device_address();
-      MS_EXCEPTION_IF_NULL(device_address);
-      if (!device_address->SyncHostToDevice(tensor->shape(), LongToSize(tensor->data().nbytes()), tensor->data_type(),
-                                            tensor->data_c(), tensor->device_info().host_format_)) {
-        MS_LOG(EXCEPTION) << "SyncHostToDevice failed for device loop control parameter " << kCurLoopCountName;
-      }
+  bool is_data_map_ = false;
+  if (graphs.size() == 1) {
+    const auto &graph_ = graphs[0];
+    KernelGraphPtr kernel_graph = std::dynamic_pointer_cast<session::KernelGraph>(graph_);
+    const auto kernels = kernel_graph->execution_order();
+    is_data_map_ = std::any_of(kernels.cbegin(), kernels.cend(), [](const auto &kernel) {
+      return kernel->fullname_with_scope().find("InitDataSetQueue") != std::string::npos;
+    });
+  }
+  if (!is_data_map_) {
+    auto kCurLoopCountName = "current_loop_count";
+    for (size_t i = 0; i < graphs.size(); i++) {
+      const auto &graph_ = graphs[i];
+      if (device_contexts[i]->GetDeviceType() != device::DeviceType::kAscend) {
+        continue;
+      }
+      auto device_loop_control_tensors = graph_->device_loop_control_tensors();
+      if (device_loop_control_tensors.count(kCurLoopCountName) == 0) {
+        MS_LOG(WARNING) << "Can't find Device Loop Control Tensor " << kCurLoopCountName;
+        return;
+      }
+      auto tensor = device_loop_control_tensors.at(kCurLoopCountName);
+      MS_EXCEPTION_IF_NULL(tensor);
+      auto *cur_val = static_cast<int32_t *>(tensor->data_c());
+      MS_EXCEPTION_IF_NULL(cur_val);
+      *cur_val = current_step;
+      tensor->set_sync_status(kNeedSyncHostToDevice);
+      auto device_address = tensor->device_address();
+      MS_EXCEPTION_IF_NULL(device_address);
+      if (!device_address->SyncHostToDevice(tensor->shape(), LongToSize(tensor->data().nbytes()), tensor->data_type(),
+                                            tensor->data_c(), tensor->device_info().host_format_)) {
+        MS_LOG(EXCEPTION) << "SyncHostToDevice failed for device loop control parameter " << kCurLoopCountName;
+      }
+    }
+    current_step++;
    }
    current_step++;
  }
#endif
}
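The reworked step-begin logic first checks whether the single launched graph is the dataset-init ("data map") graph, identified by an InitDataSetQueue kernel in its execution order, and only updates the loop-control tensor and current_step for real compute graphs. A standalone sketch of that detection follows; the kernel full names are invented examples, not taken from the sources.

// Standalone illustration of the data-map detection; kernel names are invented examples.
#include <algorithm>
#include <iostream>
#include <string>
#include <vector>

bool IsDataMapGraph(const std::vector<std::string> &kernel_fullnames) {
  return std::any_of(kernel_fullnames.cbegin(), kernel_fullnames.cend(), [](const std::string &name) {
    return name.find("InitDataSetQueue") != std::string::npos;
  });
}

int main() {
  std::vector<std::string> data_graph = {"Default/InitDataSetQueue-op0"};
  std::vector<std::string> train_graph = {"Default/Conv2D-op1", "Default/ReLU-op2"};
  std::cout << IsDataMapGraph(data_graph) << " " << IsDataMapGraph(train_graph) << std::endl;  // 1 0
  return 0;
}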
@@ -224,6 +235,17 @@ void DebugActor::DebugOnStepEnd(OpContext<DeviceTensor> *const op_context, const
  }
#endif

#ifdef ENABLE_DEBUGGER
#ifndef ENABLE_SECURITY
  if (DumpJsonParser::GetInstance().async_dump_enabled() && DumpJsonParser::GetInstance().op_debug_mode() > 0 &&
      Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
    uint32_t rank_id = Debugger::GetRankID();
    uint32_t graph_id = Debugger::GetInstance()->GetCurrentRootGraphId();
    DeleteNoOverflowFile(rank_id, graph_id);
  }
#endif
#endif

#ifdef ENABLE_DEBUGGER
  auto debugger = Debugger::GetInstance();
  if (debugger != nullptr) {
@@ -22,6 +22,9 @@
#include "runtime/graph_scheduler/actor/actor_common.h"
#include "runtime/graph_scheduler/device_tensor_store.h"
#include "runtime/hardware/device_context.h"
#ifdef ENABLE_DEBUGGER
#include "debug/data_dump/dump_utils.h"
#endif

namespace mindspore {
namespace runtime {