!28671 Move e2e dump to super kernel Ascend MindRT

Merge pull request !28671 from parastooashtari/ascend_mindrt
This commit is contained in:
i-robot 2022-01-10 08:11:58 +00:00 committed by Gitee
commit 05cd3ca997
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
5 changed files with 43 additions and 19 deletions

View File

@ -30,6 +30,7 @@
#include "backend/kernel_compiler/kernel.h"
#include "debug/data_dump/e2e_dump.h"
#include "utils/config_manager.h"
#include "backend/session/session_basic.h"
constexpr int kFailure = 1;
@ -230,6 +231,40 @@ void LoadDataForDebugger(const KernelGraphPtr &graph_ptr) {
#endif
}
// Forwards the graph to E2eDump::DumpSetup.
// `graph` must be non-null; MS_EXCEPTION_IF_NULL rejects a null pointer before the underlying raw
// pointer is handed over. Entry and exit are traced at DEBUG log level.
void DumpSetup(const KernelGraphPtr &graph) {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(graph);
  // E2eDump works on the raw graph pointer; ownership stays with the caller's shared pointer.
  auto *const raw_graph = graph.get();
  E2eDump::DumpSetup(raw_graph);
  MS_LOG(DEBUG) << "Finish!";
}
// Runs the e2e dump calls for `graph` on behalf of rank `rank_id`.
// `graph` must be non-null (checked with MS_EXCEPTION_IF_NULL). E2eDump::DumpRunIter is invoked first
// with the shared pointer, then E2eDump::DumpData with the raw pointer; the order is kept as is.
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(graph);
  E2eDump::DumpRunIter(graph, rank_id);
  // DumpData takes the raw graph pointer rather than the shared pointer.
  auto *const raw_graph = graph.get();
  E2eDump::DumpData(raw_graph, rank_id);
  MS_LOG(DEBUG) << "Finish!";
}
// Returns the rank id to use for dumping.
// The result is 0 unless HCCL is enabled in the MsContext and the RANK_ID environment variable is
// non-empty; only in that case the value comes from GetRankId().
uint32_t GetRankID() {
  auto context = MsContext::GetInstance();
  MS_EXCEPTION_IF_NULL(context);
  const auto rank_env = common::GetEnv("RANK_ID");
  const bool hccl_enabled = context->get_param<bool>(MS_CTX_ENABLE_HCCL);
  if (!hccl_enabled || rank_env.empty()) {
    // Not a distributed training run: fall back to the default rank.
    return 0;
  }
  // Distributed training case: query the actual rank id.
  return GetRankId();
}
// E2e dump entry point for a super-kernel graph.
// Dumps `graph` for the current rank and then calls DumpSetup on the same graph. When
// ENABLE_SECURITY is defined the body is compiled out and the function does nothing.
void SuperKernelE2eDump(const KernelGraphPtr &graph) {
#ifndef ENABLE_SECURITY
  const uint32_t rank_id = GetRankID();
  Dump(graph, rank_id);
  DumpSetup(graph);
#endif
}
#ifdef ENABLE_D
int32_t DumpDataCallBack(const DumpChunk *dump_chunk, int32_t size) {
MS_LOG(DEBUG) << "ADX DumpDataCallBack is called";

View File

@ -46,6 +46,8 @@ std::string CheckDatasetSinkMode(const KernelGraphPtr &graph_ptr);
// NOTE(review): the definition is not fully visible here; judging by the name this loads graph data
// for the debugger -- confirm against the definition.
void LoadDataForDebugger(const KernelGraphPtr &graph_ptr);
// Runs the e2e dump step for a super-kernel graph: dumps `graph` for the current rank id and then
// calls DumpSetup on it. The definition has an empty body when ENABLE_SECURITY is defined.
void SuperKernelE2eDump(const KernelGraphPtr &graph);
#ifdef ENABLE_D
// Callback function to dump ascend async mode
int32_t DumpDataCallBack(const DumpChunk *dump_chunk, int32_t size);

View File

@ -623,6 +623,8 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std::
ret = DumpJsonParser::DumpToFile(path, out_tensor->data_c(), host_size, host_shape, host_type);
} else {
auto host_tmp = std::vector<uint8_t>(size_);
BindDevice();
SyncStream();
auto ret_rt_memcpy = aclrtMemcpy(host_tmp.data(), size_, ptr_, size_, ACL_MEMCPY_DEVICE_TO_HOST);
if (ret_rt_memcpy != RT_ERROR_NONE) {
MS_LOG(ERROR) << "SyncDeviceToHost: aclrtMemcpy mem size[" << size_ << "] fail, ret[" << ret_rt_memcpy << "]";

View File

@ -100,6 +100,10 @@ void DebugActor::DebugForGraph(const KernelGraphPtr &graph, const DeviceContext
MS_LOG(DEBUG) << "Super kernel debug for graph: " << graph->graph_id() << ".";
#ifdef ENABLE_DEBUGGER
LoadDataForDebugger(graph);
// If dump is enabled, this call updates the graph history file and cur_dump_iter.
// If e2e dump is enabled, it also dumps the graph.
SuperKernelE2eDump(graph);
#endif
// Call back to the from actor to process after debug finished.
ActorDispatcher::Send(*from_aid, &DebugAwareActor::OnDebugFinish, op_context);

View File

@ -222,21 +222,6 @@ void DumpInit(uint32_t device_id) {
}
}
}
// Forwards the graph to E2eDump::DumpSetup.
// `graph` must be non-null; MS_EXCEPTION_IF_NULL rejects a null pointer before the underlying raw
// pointer is handed over. Entry and exit are traced at DEBUG log level.
void DumpSetup(const KernelGraphPtr &graph) {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(graph);
  // E2eDump works on the raw graph pointer; ownership stays with the caller's shared pointer.
  auto *const raw_graph = graph.get();
  E2eDump::DumpSetup(raw_graph);
  MS_LOG(DEBUG) << "Finish!";
}
// Runs the e2e dump calls for `graph` on behalf of rank `rank_id`.
// `graph` must be non-null (checked with MS_EXCEPTION_IF_NULL). E2eDump::DumpRunIter is invoked first
// with the shared pointer, then E2eDump::DumpData with the raw pointer; the order is kept as is.
void Dump(const KernelGraphPtr &graph, uint32_t rank_id) {
  MS_LOG(DEBUG) << "Start!";
  MS_EXCEPTION_IF_NULL(graph);
  E2eDump::DumpRunIter(graph, rank_id);
  // DumpData takes the raw graph pointer rather than the shared pointer.
  auto *const raw_graph = graph.get();
  E2eDump::DumpData(raw_graph, rank_id);
  MS_LOG(DEBUG) << "Finish!";
}
#endif
void AscendDeviceContext::Initialize() {
@ -569,10 +554,6 @@ bool AscendDeviceContext::ExecuteGraph(const KernelGraphPtr &graph) const {
std::lock_guard<std::mutex> locker(launch_mutex_);
ret = runtime_instance_->RunTask(*graph);
}
#ifndef ENABLE_SECURITY
Dump(graph, GetRankID());
DumpSetup(graph);
#endif
#if defined(_WIN32) || defined(_WIN64)
auto end_time = std::chrono::steady_clock::now();
std::chrono::duration<double, std::ratio<1, kUSecondInSecond>> cost = end_time - start_time;