!31482 Fix loading tensors into memory twice for Ascend kernel-by-kernel dump

Merge pull request !31482 from TinaMengtingZhang/kernel_dump
i-robot 2022-03-26 01:53:33 +00:00 committed by Gitee
commit d40dc4f997
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
18 changed files with 152 additions and 148 deletions

View File

@ -263,6 +263,7 @@ bool DumpJsonParser::DumpToFile(const std::string &filename, const void *data, s
return false;
}
const std::string file_path_str = file_path.value();
MS_LOG(INFO) << "Dump path is " << file_path_str;
ChangeFileMode(file_path_str, S_IWUSR);
std::ofstream fd(file_path_str, std::ios::out | std::ios::trunc | std::ios::binary);
if (!fd.is_open()) {

View File

@ -118,23 +118,23 @@ bool E2eDump::IsDeviceTargetGPU() {
return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
}
bool E2eDump::IsMindRTKernelByKernel() {
return IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag();
}
/*
* Feature group: Dump.
* Target device group: GPU.
* Target device group: GPU, Ascend.
* Runtime category: Old runtime, MindRT.
* Description: This function is for dumping tensor in memory to disk in GPU machine.
* Description: This function dumps to disk a tensor that has already been loaded into tensor_loader, for GPU and Ascend machines.
*/
void E2eDump::DumpGPUMemToFile(const Debugger *debugger, const std::string &file_path, bool trans_flag,
const device::DeviceAddress &addr, const std::string &original_kernel_name, size_t slot,
const ShapeVector &int_shapes, const TypeId &host_type) {
void E2eDump::DumpMemFromTensorLoaderToFile(const Debugger *debugger, const std::string &file_path,
const std::string &original_kernel_name, size_t slot) {
#ifdef ENABLE_DEBUGGER
auto format = kOpFormat_DEFAULT;
MS_EXCEPTION_IF_NULL(debugger);
auto ret = debugger->DumpTensorToFile(file_path, trans_flag, format, addr.format(), original_kernel_name, slot,
int_shapes, host_type);
auto ret = debugger->DumpTensorToFile(file_path, original_kernel_name, slot);
if (!ret) {
MS_LOG(INFO) << "DumpTensorToFile Failed: flag:" << trans_flag << ", path:" << file_path
<< ", host_format:" << format;
MS_LOG(INFO) << "DumpTensorToFile Failed: path:" << file_path;
}
#endif
}
@ -184,6 +184,7 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
continue;
}
auto addr = AnfAlgo::GetOutputAddr(node, j);
std::string node_name = GetKernelNodeName(node);
MS_EXCEPTION_IF_NULL(addr);
ShapeVector int_shapes;
GetDumpIntShape(node, j, NOT_NULL(&int_shapes), trans_flag);
@ -196,14 +197,13 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
std::string file_path = dump_path + '/' + op_type + '.' + op_name + '.' + std::to_string(task_id) + '.' +
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".output." +
std::to_string(j);
if (DumpJsonParser::GetInstance().IsStatisticDump() &&
(IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) {
if (DumpJsonParser::GetInstance().IsStatisticDump() && IsMindRTKernelByKernel()) {
TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, false, j, j);
(void)stat_dump.DumpTensorStatsToFile(GetKernelNodeName(node), dump_path, debugger);
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (DumpJsonParser::GetInstance().IsTensorDump()) {
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, GetKernelNodeName(node), j, int_shapes, type);
if (IsMindRTKernelByKernel()) {
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, j);
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
}
@ -213,10 +213,8 @@ void E2eDump::DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::s
void E2eDump::DumpOutputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name) {
auto debugger = Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend";
if (IsMindRTKernelByKernel()) {
MS_LOG(INFO) << "DumpOutputData is only for graph mode on Ascend";
return;
}
MS_EXCEPTION_IF_NULL(node);
@ -256,8 +254,7 @@ void E2eDump::DumpInput(const session::KernelGraph *graph, const std::string &du
}
}
void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger,
const KernelLaunchInfo *launch_info) {
void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger) {
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (!dump_json_parser.InputNeedDump()) {
return;
@ -269,25 +266,11 @@ void E2eDump::DumpInputSingleNode(const CNodePtr &node, const std::string &dump_
return;
}
DumpJsonParser::GetInstance().MatchKernel(kernel_name);
DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger, launch_info);
}
std::shared_ptr<device::DeviceAddress> CreateAscendDeviceAddress(const KernelLaunchInfo *launch_info, size_t index,
TypeId type) {
MS_EXCEPTION_IF_NULL(launch_info);
auto addr_ptr = launch_info->inputs_[index];
auto ms_context = MsContext::GetInstance();
MS_EXCEPTION_IF_NULL(ms_context);
auto device_id = ms_context->get_param<uint32_t>(MS_CTX_DEVICE_ID);
auto device_context =
device::DeviceContextManager::GetInstance().GetOrCreateDeviceContext({kAscendDevice, device_id});
auto format = kOpFormat_DEFAULT;
MS_EXCEPTION_IF_NULL(addr_ptr);
return device_context->CreateDeviceAddress(addr_ptr->addr, addr_ptr->size, format, type, ShapeVector());
DumpInputImpl(node, trans_flag, dump_path, &kernel_name, debugger);
}
void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger, const KernelLaunchInfo *launch_info) {
std::string *kernel_name, const Debugger *debugger) {
MS_EXCEPTION_IF_NULL(node);
GetFileKernelName(NOT_NULL(kernel_name));
auto input_size = common::AnfAlgo::GetInputTensorNum(node);
@ -298,12 +281,12 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
if (!AnfAlgo::OutputAddrExist(input, index)) {
continue;
}
std::string tensor_name = GetKernelNodeName(node);
std::string node_name = GetKernelNodeName(node);
size_t slot = j;
if (IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
if (IsMindRTKernelByKernel()) {
auto input_kernel = node->input(j + 1);
std::string input_kernel_name = GetKernelNodeName(input_kernel);
tensor_name = input_kernel_name;
node_name = input_kernel_name;
slot = 0;
}
ShapeVector int_shapes;
@ -318,18 +301,13 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
std::to_string(stream_id) + '.' + std::to_string(timestamp) + ".input." + std::to_string(j);
auto addr = AnfAlgo::GetOutputAddr(input, index);
MS_EXCEPTION_IF_NULL(addr);
if (DumpJsonParser::GetInstance().IsStatisticDump() &&
(IsDeviceTargetGPU() || Debugger::GetInstance()->GetAscendKernelByKernelFlag())) {
if (DumpJsonParser::GetInstance().IsStatisticDump() && IsMindRTKernelByKernel()) {
TensorStatDump stat_dump(op_type, op_name, task_id, stream_id, timestamp, true, j, slot);
(void)stat_dump.DumpTensorStatsToFile(tensor_name, dump_path, debugger);
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (DumpJsonParser::GetInstance().IsTensorDump()) {
if (IsDeviceTargetGPU()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, tensor_name, slot, int_shapes, type);
} else if (Debugger::GetInstance()->GetAscendKernelByKernelFlag()) {
// load address from launch_info when it's Ascend Kernel by kernel mode.
auto ascend_device_addr = CreateAscendDeviceAddress(launch_info, j, type);
DumpMemToFile(file_path, *ascend_device_addr, int_shapes, type, trans_flag);
if (IsMindRTKernelByKernel()) {
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, slot);
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
}
@ -339,9 +317,7 @@ void E2eDump::DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::st
void E2eDump::DumpInputData(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name) {
auto debugger = Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
if (IsDeviceTargetGPU() || debugger->GetAscendKernelByKernelFlag()) {
if (IsMindRTKernelByKernel()) {
MS_LOG(INFO) << "DumpInputData is only for graph mode on Ascend";
return;
}
@ -409,7 +385,7 @@ void E2eDump::DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (dump_json_parser.IsTensorDump()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, node_name, 0, int_shapes, type);
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, 0);
}
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
@ -452,7 +428,7 @@ void E2eDump::DumpSingleParameterNode(const AnfNodePtr &anf_node, const std::str
(void)stat_dump.DumpTensorStatsToFile(node_name, dump_path, debugger);
}
if (dump_json_parser.IsTensorDump()) {
DumpGPUMemToFile(debugger, file_path, trans_flag, *addr, node_name, 0, int_shapes, type);
DumpMemFromTensorLoaderToFile(debugger, file_path, node_name, 0);
}
} else {
DumpMemToFile(file_path, *addr, int_shapes, type, trans_flag);
@ -662,13 +638,12 @@ void E2eDump::DumpData(const session::KernelGraph *graph, uint32_t rank_id, cons
* Runtime category: MindRT.
* Description: This function is for dumping a single node. It is used for MindRT in GPU and Ascend kernel-by-kernel mode.
*/
bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger,
const KernelLaunchInfo *launch_info) {
bool E2eDump::DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id, const Debugger *debugger) {
bool success = false;
auto &dump_json_parser = DumpJsonParser::GetInstance();
if (dump_json_parser.DumpEnabledForIter()) {
std::string dump_path = GenerateDumpPath(graph_id, rank_id);
DumpInputSingleNode(node, dump_path, debugger, launch_info);
DumpInputSingleNode(node, dump_path, debugger);
DumpOutputSingleNode(node, dump_path, debugger);
success = true;
}
@ -761,9 +736,10 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
if (dump_tensor_vec.empty()) {
return;
}
// The maximum tensor size allowed for single-thread format conversion is 1 MB.
constexpr int kMaxTensorSize = 1048576;
if (offset <= kMaxTensorSize) {
// If the total tensor size is less than 1Mb, do it in single thread.
// If the total tensor size is less than 1MB, do it in single thread.
ConvertFormatForTensors(&dump_tensor_vec, 0, dump_tensor_vec.size() - 1);
} else {
// In the multi-thread process, we only use 1/4 of the total concurrent threads.
@ -775,7 +751,7 @@ void E2eDump::DumpTensorToFile(const std::string &dump_path, const debugger::dum
std::vector<std::thread> threads;
threads.reserve(num_threads);
MS_LOG(INFO) << "Number of threads used for A+M dump: " << num_threads;
for (size_t t = 0; t < threads.capacity(); t++) {
for (size_t t = 0; t < num_threads; t++) {
uint32_t start_idx = t * task_size;
uint32_t end_idx = start_idx + task_size - 1;
if (t == num_threads - 1) {
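A minimal, self-contained sketch of the chunking scheme in the hunk above, with toy stand-ins (items, ProcessRange) in place of dump_tensor_vec and ConvertFormatForTensors: each thread takes task_size entries and the last thread absorbs the remainder.

#include <cstdint>
#include <thread>
#include <vector>

// Stand-in for ConvertFormatForTensors: process items[start..end] inclusive.
void ProcessRange(const std::vector<int> *items, uint32_t start, uint32_t end) {
  for (uint32_t i = start; i <= end; i++) {
    (void)(*items)[i];
  }
}

int main() {
  std::vector<int> items(10);      // stand-in for dump_tensor_vec
  const uint32_t num_threads = 4;  // stand-in for 1/4 of the hardware threads
  const uint32_t task_size = static_cast<uint32_t>(items.size()) / num_threads;
  std::vector<std::thread> threads;
  threads.reserve(num_threads);
  for (uint32_t t = 0; t < num_threads; t++) {
    uint32_t start_idx = t * task_size;
    uint32_t end_idx = start_idx + task_size - 1;
    if (t == num_threads - 1) {
      end_idx = static_cast<uint32_t>(items.size()) - 1;  // last thread absorbs the remainder
    }
    threads.emplace_back(ProcessRange, &items, start_idx, end_idx);
  }
  for (auto &th : threads) {
    th.join();
  }
  return 0;
}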

View File

@ -32,7 +32,6 @@
#endif
#include "include/backend/visible.h"
using mindspore::kernel::KernelLaunchInfo;
#ifndef ENABLE_DEBUGGER
class Debugger;
#endif
@ -71,12 +70,11 @@ class E2eDump {
static void DumpParametersData(uint32_t rank_id, const Debugger *debugger);
static bool DumpSingleNodeData(const CNodePtr &node, uint32_t graph_id, uint32_t rank_id,
const Debugger *debugger = nullptr, const KernelLaunchInfo *launch_info = nullptr);
const Debugger *debugger = nullptr);
// Dump data when task error.
static void DumpInputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger,
const KernelLaunchInfo *launch_info = nullptr);
std::string *kernel_name, const Debugger *debugger);
static void DumpOutputImpl(const CNodePtr &node, bool trans_flag, const std::string &dump_path,
std::string *kernel_name, const Debugger *debugger);
@ -93,6 +91,10 @@ class E2eDump {
char *data_ptr);
#endif
static bool IsDeviceTargetGPU();
static bool IsMindRTKernelByKernel();
private:
static void DumpOutput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
@ -100,15 +102,13 @@ class E2eDump {
static void DumpInput(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger,
const KernelLaunchInfo *launch_info = nullptr);
static void DumpInputSingleNode(const CNodePtr &node, const std::string &dump_path, const Debugger *debugger);
static void DumpParameters(const session::KernelGraph *graph, const std::string &dump_path, const Debugger *debugger);
static void DumpGPUMemToFile(const Debugger *debugger, const std::string &file_path, bool trans_flag,
const device::DeviceAddress &addr, const std::string &original_kernel_name, size_t slot,
const ShapeVector &int_shapes, const TypeId &host_type);
static bool IsDeviceTargetGPU();
static void DumpMemFromTensorLoaderToFile(const Debugger *debugger, const std::string &file_path,
const std::string &original_kernel_name, size_t slot);
static void DumpSingleAnfNode(const AnfNodePtr &anf_node, const size_t output_index, const std::string &dump_path,
bool trans_flag, const Debugger *debugger);

View File

@ -33,11 +33,6 @@ constexpr auto kCsvFileName = "statistic.csv";
} // namespace
namespace mindspore {
const std::map<DbgDataType, std::string> kDbgDataTypeToStringMap = {
{DT_BOOL, "bool"}, {DT_INT8, "int8"}, {DT_INT16, "int16"}, {DT_INT32, "int32"},
{DT_INT64, "int64"}, {DT_UINT8, "uint8"}, {DT_UINT16, "uint16"}, {DT_UINT32, "uint32"},
{DT_UINT64, "uint64"}, {DT_FLOAT16, "float16"}, {DT_FLOAT32, "float32"}, {DT_FLOAT64, "float64"}};
bool CsvWriter::OpenFile(const std::string &path, const std::string &header) {
if (file_.is_open() && path == file_path_str_) {
return true;
@ -162,13 +157,10 @@ bool TensorStatDump::DumpTensorStatsToFile(const std::string &dump_path, const s
MS_LOG(INFO) << "Tensor data is empty, skipping current statistics";
return false;
}
std::string type;
auto iter_type = kDbgDataTypeToStringMap.find(data->GetType());
if (iter_type == kDbgDataTypeToStringMap.end()) {
std::string type = data->GetTypeString();
if (type.empty()) {
type = "unsupported(" + std::to_string(data->GetType()) + ")";
MS_LOG(INFO) << "Unsupported tensor data_type " << type << " for tensor " << data->GetName();
} else {
type = iter_type->second;
}
if (!OpenStatisticsFile(dump_path)) {
return false;

View File

@ -1836,11 +1836,8 @@ std::shared_ptr<TensorData> DebugServices::GetTensor(const std::string &tensor_n
void DebugServices::EmptyCurrentTensor() { tensor_loader_->EmptyCurrentTensor(); }
#ifdef ONLINE_DBG_MODE
bool DebugServices::DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const {
return tensor_loader_->DumpTensorToFile(filepath, trans_flag, host_fmt, addr_format, tensor_name, slot, host_shape,
host_type);
bool DebugServices::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
return tensor_loader_->DumpTensorToFile(filepath, tensor_name, slot);
}
#endif

View File

@ -461,9 +461,7 @@ class DebugServices {
void EmptyCurrentTensor();
#ifdef ONLINE_DBG_MODE
bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const;
bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const;
#endif
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);

View File

@ -527,10 +527,10 @@ void Debugger::DumpConstantDataAscend(const KernelGraphPtr &graph) {
* Runtime category: MindRT.
* Description: Dumps a single node for given graph_id.
*/
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id, const KernelLaunchInfo *launch_info) {
void Debugger::DumpSingleNode(const CNodePtr &node, uint32_t graph_id) {
if (debugger_ && debugger_->DebuggerBackendEnabled()) {
uint32_t rank_id = GetRankID();
(void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get(), launch_info);
(void)E2eDump::DumpSingleNodeData(node, graph_id, rank_id, debugger_.get());
}
}
@ -1335,11 +1335,8 @@ void Debugger::SendWatchpoints(const std::list<WatchpointHit> &points) {
}
}
bool Debugger::DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const {
return debug_services_.get()->DumpTensorToFile(filepath, trans_flag, host_fmt, addr_format, tensor_name, slot,
host_shape, host_type);
bool Debugger::DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const {
return debug_services_.get()->DumpTensorToFile(filepath, tensor_name, slot);
}
bool Debugger::LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev) {
@ -1541,7 +1538,8 @@ void Debugger::LoadSingleAnfnode(const AnfNodePtr &anf_node, const size_t output
} else {
keep_prev = false;
}
bool ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false);
bool ret =
addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, 0, keep_prev, root_graph_id, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@ -1572,7 +1570,7 @@ void Debugger::LoadSingleParameterMindRT(const AnfNodePtr &node) {
}
// Keep_prev is True for parameters.
// force update for parameters.
bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true);
bool ret = device_addr->LoadMemToHost(tensor_name, 0, format, int_shapes, type, 0, true, root_graph_id, true, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
@ -1702,7 +1700,8 @@ void Debugger::LoadGraphOutputs() {
auto format = kOpFormat_DEFAULT;
string tensor_name = kernel_name + ':' + std::to_string(j);
ShapeVector int_shapes = trans::GetRuntimePaddingShape(node, j);
auto ret = addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false);
auto ret =
addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, root_graph_id, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";

View File

@ -107,7 +107,7 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
void DumpConstantDataAscend(const KernelGraphPtr &graph);
void DumpSingleNode(const CNodePtr &node, uint32_t graph_id, const KernelLaunchInfo *launch_info = nullptr);
void DumpSingleNode(const CNodePtr &node, uint32_t graph_id);
void DumpInGraphCompiler(const KernelGraphPtr &kernel_graph);
@ -117,9 +117,7 @@ class BACKEND_EXPORT Debugger : public std::enable_shared_from_this<Debugger> {
void PostExecuteNode(const CNodePtr &kernel, bool last_kernel);
bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) const;
bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) const;
bool LoadNewTensor(const std::shared_ptr<TensorData> &tensor, bool keep_prev);

View File

@ -66,12 +66,12 @@ std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Target device group: GPU, Ascend.
* Runtime category: MindRT.
* Description: Get kernel inputs from launch_info and load the inputs from device to host.
*/
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, uint32_t root_graph_id,
const DeviceContext *device_context) {
const DeviceContext *device_context, const bool trans_flag) {
// get inputs
auto kernel_inputs = launch_info->inputs_;
auto input_size = common::AnfAlgo::GetInputTensorNum(cnode);
@ -79,33 +79,40 @@ void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint
auto input_kernel = cnode->input(j + 1);
std::string input_kernel_name = GetKernelNodeName(input_kernel);
auto addr = kernel_inputs[j];
auto type = common::AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
auto device_type = AnfAlgo::GetOutputDeviceDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
auto host_type = common::AnfAlgo::GetOutputInferDataType(input_kernel, PARAMETER_OUTPUT_INDEX);
auto type = trans_flag ? host_type : device_type;
// For example, this happens with the Depend op
if (type == kMetaTypeNone) {
continue;
}
auto format = kOpFormat_DEFAULT;
auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector());
auto host_format = kOpFormat_DEFAULT;
auto device_format =
E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(input_kernel, PARAMETER_OUTPUT_INDEX);
auto device_addr =
device_context->CreateDeviceAddress(addr->addr, addr->size, device_format, device_type, ShapeVector());
string input_tensor_name = input_kernel_name + ':' + "0";
ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), format, int_shapes, type, 0, true,
root_graph_id, false);
ShapeVector int_shapes;
GetDumpIntShape(input_kernel, PARAMETER_OUTPUT_INDEX, NOT_NULL(&int_shapes), trans_flag);
auto ret = device_addr->LoadMemToHost(input_tensor_name, UintToInt(exec_order), host_format, int_shapes, type, 0,
true, root_graph_id, false, trans_flag);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
<< ", tensor_name:" << input_tensor_name << ", host_format:" << host_format
<< ", device_format:" << device_format << ".";
}
}
}
/*
* Feature group: Dump, Online debugger.
* Target device group: GPU.
* Target device group: GPU, Ascend.
* Runtime category: MindRT.
* Description: Get kernel outputs from launch_info and load the outputs from device to host.
*/
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
uint32_t root_graph_id, const DeviceContext *device_context) {
uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag) {
// get outputs
auto kernel_outputs = launch_info->outputs_;
auto output_size = common::AnfAlgo::GetOutputTensorNum(cnode);
@ -115,21 +122,27 @@ void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uin
for (size_t j : real_outputs) {
auto addr = kernel_outputs[j];
auto type = common::AnfAlgo::GetOutputInferDataType(cnode, j);
auto device_type = AnfAlgo::GetOutputDeviceDataType(cnode, j);
auto host_type = common::AnfAlgo::GetOutputInferDataType(cnode, j);
auto type = trans_flag ? host_type : device_type;
// For example, this happens with the Depend op
if (type == kMetaTypeNone) {
continue;
}
auto format = kOpFormat_DEFAULT;
auto device_addr = device_context->CreateDeviceAddress(addr->addr, addr->size, format, type, ShapeVector());
auto host_format = kOpFormat_DEFAULT;
auto device_format = E2eDump::IsDeviceTargetGPU() ? kOpFormat_DEFAULT : AnfAlgo::GetOutputFormat(cnode, j);
auto device_addr =
device_context->CreateDeviceAddress(addr->addr, addr->size, device_format, device_type, ShapeVector());
string tensor_name = kernel_name + ':' + std::to_string(j);
ShapeVector int_shapes = trans::GetRuntimePaddingShape(cnode, j);
auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), format, int_shapes, type, j, false,
root_graph_id, false);
ShapeVector int_shapes;
GetDumpIntShape(cnode, j, NOT_NULL(&int_shapes), trans_flag);
auto ret = device_addr->LoadMemToHost(tensor_name, UintToInt(exec_order), host_format, int_shapes, type, j, false,
root_graph_id, false, trans_flag);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";
<< ", tensor_name:" << tensor_name << ", host_format:" << host_format
<< ", device_format:" << device_format << ".!";
}
}
}
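The trans_flag plumbing above boils down to two ternaries. A self-contained toy, with illustrative stand-in values in place of the real AnfAlgo lookups and dump config:

#include <iostream>
#include <string>

int main() {
  const bool trans_flag = false;                    // e.g. DumpJsonParser trans_flag
  const std::string device_format = "FRACTAL_NZ";   // stand-in for AnfAlgo::GetOutputFormat
  const std::string host_format = "DefaultFormat";  // stand-in for kOpFormat_DEFAULT
  const int device_type = 1;                        // stand-in for GetOutputDeviceDataType
  const int host_type = 2;                          // stand-in for GetOutputInferDataType

  // Mirrors `auto type = trans_flag ? host_type : device_type;` above.
  const int type = trans_flag ? host_type : device_type;
  // Mirrors `tensor_format = trans_flag ? host_fmt : format_;` in LoadMemToHost.
  const std::string format = trans_flag ? host_format : device_format;
  std::cout << "load with type=" << type << ", dump suffix=" << format << std::endl;
  return 0;
}

Keeping the device type and format when trans_flag is false is what lets the raw device bytes be dumped without a host-side layout conversion.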
@ -168,6 +181,13 @@ bool IsDeviceTargetGPU() {
return context->get_param<std::string>(MS_CTX_DEVICE_TARGET) == kGPUDevice;
}
bool GetTransFlag() {
if (Debugger::GetInstance()->debugger_enabled() || IsDeviceTargetGPU()) {
return true;
}
return DumpJsonParser::GetInstance().trans_flag();
}
/*
* Feature group: Dump, Online debugger.
* Target device group: Ascend, GPU.
@ -187,11 +207,12 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
auto kernel_graph = std::dynamic_pointer_cast<KernelGraph>(cnode->func_graph());
MS_EXCEPTION_IF_NULL(kernel_graph);
auto root_graph_id = kernel_graph->root_graph_id();
bool trans_flag = GetTransFlag();
if (debugger->debugger_enabled() || dump_json_parser.InputNeedDump()) {
LoadInputs(cnode, launch_info, exec_order, root_graph_id, device_context);
LoadInputs(cnode, launch_info, exec_order, root_graph_id, device_context, trans_flag);
}
if (debugger->debugger_enabled() || dump_json_parser.OutputNeedDump()) {
LoadOutputs(cnode, launch_info, exec_order, root_graph_id, device_context);
LoadOutputs(cnode, launch_info, exec_order, root_graph_id, device_context, trans_flag);
}
// Dump kernel
if (dump_enabled) {
@ -202,7 +223,7 @@ void ReadDataAndDump(const CNodePtr &cnode, const KernelLaunchInfo *launch_info,
debugger->DumpSingleNode(cnode, graph_id);
} else {
// for Ascend, nodes are dumped in the root_graph_id directory.
debugger->DumpSingleNode(cnode, root_graph_id, launch_info);
debugger->DumpSingleNode(cnode, root_graph_id);
}
// Clear Dumped data when online debugger is not enabled
if (!debugger->debugger_enabled()) {

View File

@ -33,10 +33,10 @@ namespace mindspore {
std::vector<size_t> CheckRealOutput(const std::string &node_name, const size_t &output_size);
void LoadInputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order, uint32_t root_graph_id,
const DeviceContext *device_context);
const DeviceContext *device_context, const bool trans_flag);
void LoadOutputs(const CNodePtr &cnode, const KernelLaunchInfo *launch_info, uint32_t exec_order,
uint32_t root_graph_id, const DeviceContext *device_context);
uint32_t root_graph_id, const DeviceContext *device_context, const bool trans_flag);
bool CheckReadData(const CNodePtr &cnode);

View File

@ -17,6 +17,7 @@
#define MINDSPORE_CCSRC_DEBUG_TENSOR_DATA_H_
#include <algorithm>
#include <map>
#include <vector>
#include <string>
#include <iostream>
@ -203,6 +204,10 @@ class TensorData {
#ifdef ONLINE_DBG_MODE
void SetTensor(const mindspore::tensor::TensorPtr &out_tensor) { this->tensor_ptr_ = out_tensor; }
void SetFormat(const std::string &format) { this->format_ = format; }
std::string GetFormat() { return this->format_; }
#endif
void SetSlot(size_t slot) { this->slot_ = slot; }
@ -239,6 +244,19 @@ class TensorData {
DbgDataType GetType() const { return this->data_type_; }
std::string GetTypeString() const {
const std::map<DbgDataType, std::string> kDbgDataTypeToStringMap = {
{DT_BOOL, "bool"}, {DT_INT8, "int8"}, {DT_INT16, "int16"}, {DT_INT32, "int32"},
{DT_INT64, "int64"}, {DT_UINT8, "uint8"}, {DT_UINT16, "uint16"}, {DT_UINT32, "uint32"},
{DT_UINT64, "uint64"}, {DT_FLOAT16, "float16"}, {DT_FLOAT32, "float32"}, {DT_FLOAT64, "float64"}};
auto iter_type = kDbgDataTypeToStringMap.find(data_type_);
if (iter_type == kDbgDataTypeToStringMap.end()) {
return std::string();
} else {
return iter_type->second;
}
}
void SetType(unsigned int type) { ConvertMsToDbgType(type); }
void SetType(const std::string &type_name) { ConvertStringToDbgType(type_name); }
@ -438,6 +456,7 @@ class TensorData {
std::string time_stamp_;
#ifdef ONLINE_DBG_MODE
std::string format_{""};
mindspore::tensor::TensorPtr tensor_ptr_{nullptr};
#endif
};

View File

@ -244,29 +244,20 @@ class TensorLoader {
* Runtime category: Old runtime, MindRT.
* Description: Load tensor data from debugger backend cache (tensor_list_map_) and dump to file in npy format.
*/
bool DumpTensorToFile(const std::string &filepath, bool trans_flag, const std::string &host_fmt,
const std::string &addr_format, const std::string &tensor_name, size_t slot,
const std::vector<int64_t> &host_shape, TypeId host_type) {
bool DumpTensorToFile(const std::string &filepath, const std::string &tensor_name, size_t slot) {
if (filepath.empty()) {
MS_LOG(ERROR) << "Dump file path is null!";
return false;
}
std::string path = "";
if (trans_flag) {
path = filepath + '.' + host_fmt;
} else {
path = filepath + '.' + addr_format;
}
MS_LOG(INFO) << "Dump path is " << path;
std::string tensor_loader_name = tensor_name + ":" + std::to_string(slot);
auto iter = tensor_list_map_.find(tensor_loader_name);
if (iter != tensor_list_map_.end()) {
std::shared_ptr<TensorData> node = iter->second;
size_t host_size = node->GetByteSize();
std::string path = filepath + '.' + node->GetFormat();
return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), host_size, host_shape, host_type);
return DumpJsonParser::DumpToFile(path, node->GetDataPtr(), node->GetByteSize(), node->GetShape(),
StringToTypeId(node->GetTypeString()));
}
MS_LOG(INFO) << "Tensor name:" << tensor_name << " not found in tensor_list_map_";
return false;
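A self-contained sketch of the "<kernel name>:<slot>" keyed lookup the loader uses above; the toy map stands in for tensor_list_map_, which really holds std::shared_ptr<TensorData>, and the kernel name is hypothetical:

#include <cstddef>
#include <iostream>
#include <map>
#include <string>

int main() {
  // Toy stand-in for tensor_list_map_: key is "<kernel name>:<slot>".
  std::map<std::string, std::string> tensor_list_map = {
      {"Default/Conv2D-op1:0", "cached tensor bytes"}};  // hypothetical kernel name

  const std::string tensor_name = "Default/Conv2D-op1";
  const std::size_t slot = 0;
  const std::string key = tensor_name + ":" + std::to_string(slot);
  auto iter = tensor_list_map.find(key);
  if (iter != tensor_list_map.end()) {
    std::cout << "found " << key << std::endl;
  } else {
    std::cout << key << " not found in tensor_list_map" << std::endl;
  }
  return 0;
}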

View File

@ -647,9 +647,10 @@ bool AscendDeviceAddress::DumpMemToFile(const std::string &filepath, const std::
* Runtime category: Old runtime, MindRT.
* Description: Load tensor to host and create tensor_data object for the loaded tensor.
*/
bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const {
bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order,
const std::string &host_fmt, const ShapeVector &host_shape, TypeId host_type,
size_t slot, bool keep_prev, uint32_t root_graph_id, bool force_update,
bool trans_flag) const {
bool ret = false;
auto debugger = Debugger::GetInstance();
MS_EXCEPTION_IF_NULL(debugger);
@ -671,9 +672,14 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
mindspore::tensor::TensorPtr out_tensor = std::make_shared<tensor::Tensor>(host_type, host_shape);
MS_EXCEPTION_IF_NULL(out_tensor);
size_t host_size = out_tensor->data().nbytes();
auto ret_sync = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
bool ret_sync = false;
if (trans_flag) {
ret_sync = SyncDeviceToHost(host_shape, host_size, host_type, out_tensor->data_c());
} else {
ret_sync = SyncDeviceToHost(host_size, out_tensor->data_c());
}
if (!ret_sync) {
MS_LOG(ERROR) << "Copy device mem to host failed";
MS_LOG(ERROR) << "Convert format or Copy device mem to host failed";
return ret;
}
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
@ -683,7 +689,11 @@ bool AscendDeviceAddress::LoadMemToHost(const std::string &tensor_name, int exec
tensor_data->SetType((unsigned int)host_type);
tensor_data->SetShape(out_tensor->shape());
tensor_data->SetRootGraphId(root_graph_id);
std::string tensor_format = trans_flag ? host_fmt : format_;
tensor_data->SetFormat(tensor_format);
ret = debugger->LoadNewTensor(tensor_data, keep_prev);
MS_LOG(INFO) << "Load tensor '" << tensor_name << "' into debugger tensor loader successfully: format("
<< tensor_format << ")";
return ret;
}
#endif
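A self-contained toy of the trans_flag dispatch above. The two SyncDeviceToHost overloads here are memcpy stand-ins; the real Ascend overloads convert the device layout to host layout or copy raw device bytes, respectively:

#include <cstddef>
#include <cstdint>
#include <cstring>
#include <vector>

using ShapeVector = std::vector<int64_t>;

struct ToyDeviceAddress {
  std::vector<char> device_bytes;
  // Stand-in for the shape/type-aware overload: the real Ascend version also
  // converts the device layout (e.g. FRACTAL_NZ) to host layout.
  bool SyncDeviceToHost(const ShapeVector &, std::size_t size, int, void *host_ptr) const {
    std::memcpy(host_ptr, device_bytes.data(), size);
    return true;
  }
  // Stand-in for the size-only overload: raw byte copy, device format kept.
  bool SyncDeviceToHost(std::size_t size, void *host_ptr) const {
    std::memcpy(host_ptr, device_bytes.data(), size);
    return true;
  }
};

bool CopyForDump(const ToyDeviceAddress &addr, const ShapeVector &shape,
                 std::size_t size, int type, void *host_ptr, bool trans_flag) {
  return trans_flag ? addr.SyncDeviceToHost(shape, size, type, host_ptr)
                    : addr.SyncDeviceToHost(size, host_ptr);
}

int main() {
  ToyDeviceAddress addr{std::vector<char>(16, 0)};
  std::vector<char> host(16);
  return CopyForDump(addr, ShapeVector{4, 4}, host.size(), 0, host.data(), false) ? 0 : 1;
}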

View File

@ -62,7 +62,7 @@ class AscendDeviceAddress : public DeviceAddress {
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const override;
uint32_t root_graph_id, bool force_update, bool trans_flag) const override;
#endif
private:

View File

@ -185,7 +185,7 @@ GPUDeviceAddress::~GPUDeviceAddress() { ClearDeviceMemory(); }
#ifdef ENABLE_DEBUGGER
bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const {
uint32_t root_graph_id, bool force_update, bool) const {
bool ret = false;
if (size_ == 0) {
return true;
@ -219,6 +219,7 @@ bool GPUDeviceAddress::LoadMemToHost(const std::string &tensor_name, int executi
tensor_data->SetType((unsigned int)host_type);
tensor_data->SetShape(out_tensor->shape());
tensor_data->SetRootGraphId(root_graph_id);
tensor_data->SetFormat(host_fmt);
ret = Debugger::GetInstance()->LoadNewTensor(tensor_data, keep_prev);
MS_LOG(INFO) << "E2E tensor name is " << tensor_name;
return ret;

View File

@ -56,7 +56,7 @@ class GPUDeviceAddress : public DeviceAddress {
#ifdef ENABLE_DEBUGGER
bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const override;
uint32_t root_graph_id, bool force_update, bool trans_flag) const override;
#endif
private:

View File

@ -183,7 +183,8 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
string input_tensor_name = input_kernel_name + ':' + "0";
ShapeVector int_shapes = trans::GetRuntimePaddingShape(input_kernel, PARAMETER_OUTPUT_INDEX);
auto ret = gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true, 0, false);
auto ret =
gpu_addr->LoadMemToHost(input_tensor_name, exec_order, format, int_shapes, type, 0, true, 0, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << input_tensor_name << ", host_format:" << format << ".!";
@ -210,7 +211,7 @@ void LoadKernelData(Debugger *debugger, const CNodePtr &kernel,
auto gpu_addr = std::make_unique<GPUDeviceAddress>(addr->addr, addr->size, format, type);
string tensor_name = kernel_name + ':' + std::to_string(j);
ShapeVector int_shapes = trans::GetRuntimePaddingShape(kernel, j);
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, 0, false);
auto ret = gpu_addr->LoadMemToHost(tensor_name, exec_order, format, int_shapes, type, j, false, 0, false, true);
if (!ret) {
MS_LOG(ERROR) << "LoadMemToHost:"
<< ", tensor_name:" << tensor_name << ", host_format:" << format << ".!";

View File

@ -141,7 +141,7 @@ class DeviceAddress : public mindspore::DeviceSync {
#ifdef ENABLE_DEBUGGER
virtual bool LoadMemToHost(const std::string &tensor_name, int execution_order, const std::string &host_fmt,
const ShapeVector &host_shape, TypeId host_type, size_t slot, bool keep_prev,
uint32_t root_graph_id, bool force_update) const {
uint32_t root_graph_id, bool force_update, bool trans_flag) const {
return true;
}
#endif