forked from mindspore-Ecosystem/mindspore
!42312 [profiler] pynative 模式profiler优化;提供生成PMU&l2_cache性能数据的接口
Merge pull request !42312 from zangqx/m_Q3
This commit is contained in:
commit
b3adb8f823
|
@ -38,15 +38,11 @@
|
|||
#include "debug/data_dump/e2e_dump.h"
|
||||
#include "debug/debugger/debugger_utils.h"
|
||||
#include "plugin/device/ascend/hal/profiler/memory_profiling.h"
|
||||
#include "plugin/device/ascend/hal/device/profiling/profiling_manager.h"
|
||||
#include "utils/anf_utils.h"
|
||||
#include "plugin/device/ascend/hal/profiler/pynative_profiling.h"
|
||||
#include "plugin/device/ascend/hal/profiler/ascend_profiling.h"
|
||||
#include "plugin/device/ascend/hal/device/dump/ascend_dump.h"
|
||||
|
||||
using Adx::AdxRegDumpProcessCallBack;
|
||||
using mindspore::device::ascend::ProfilingManager;
|
||||
using mindspore::profiler::ProfilerManager;
|
||||
using mindspore::profiler::ascend::MemoryProfiling;
|
||||
#endif
|
||||
|
||||
|
@ -416,31 +412,17 @@ bool AscendKernelExecutor::LaunchKernel(const CNodePtr &kernel, const vector<Add
|
|||
if (nop_op_to_memcpy_.find(kernel) != nop_op_to_memcpy_.end()) {
|
||||
(void)MemoryCopyAsync(kernel, real_inputs, outputs);
|
||||
} else {
|
||||
#ifndef ENABLE_SECURITY
|
||||
auto profiler_inst = profiler::ascend::PynativeProfiler::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(profiler_inst);
|
||||
std::thread::id t_id = std::this_thread::get_id();
|
||||
profiler_inst->OpDataProducerBegin(res_manager_->runtime_instance_, stream, t_id, kernel->fullname_with_scope(),
|
||||
is_dynamic_shape);
|
||||
#endif
|
||||
MS_LOG(DEBUG) << "Begin launch kernel: " << kernel->fullname_with_scope();
|
||||
ret = kernel_mod->Launch(real_inputs, workspace, outputs, stream);
|
||||
MS_LOG(DEBUG) << "End launch kernel: " << kernel->fullname_with_scope();
|
||||
#ifndef ENABLE_SECURITY
|
||||
profiler_inst->OpDataProducerEnd(t_id, is_dynamic_shape);
|
||||
#endif
|
||||
if (!ret) {
|
||||
MS_LOG(ERROR) << "Launch kernel failed, kernel full name: " << kernel->fullname_with_scope();
|
||||
return false;
|
||||
}
|
||||
}
|
||||
auto ascend_instance = profiler::ascend::AscendProfiler::GetInstance();
|
||||
auto profiler_manage_instance = profiler::ProfilerManager::GetInstance();
|
||||
MS_EXCEPTION_IF_NULL(ascend_instance);
|
||||
MS_EXCEPTION_IF_NULL(profiler_manage_instance);
|
||||
if ((profiler_manage_instance->GetNetDynamicShapeStatus() ||
|
||||
ms_context->get_param<int>(MS_CTX_EXECUTION_MODE) == kGraphMode) &&
|
||||
ascend_instance->GetEnableFlag()) {
|
||||
if (ascend_instance->GetEnableFlag()) {
|
||||
ascend_instance->GetNodeTaskIdStreamId(kernel, graph_id, UintToInt(device_id), kernel_type);
|
||||
}
|
||||
|
||||
|
|
|
@ -41,13 +41,13 @@ namespace {
|
|||
PROFILER_REG(kAscendDevice, AscendProfiler);
|
||||
} // namespace
|
||||
|
||||
std::map<std::string, aclprofAicoreMetrics> kAicMetrics{
|
||||
{"ArithmeticUtilization", ACL_AICORE_ARITHMETIC_UTILIZATION},
|
||||
{"PipeUtilization", ACL_AICORE_PIPE_UTILIZATION},
|
||||
{"Memory", ACL_AICORE_MEMORY_BANDWIDTH},
|
||||
{"MemoryLO", ACL_AICORE_L0B_AND_WIDTH},
|
||||
{"ResourceConflictRatio", ACL_AICORE_RESOURCE_CONFLICT_RATIO},
|
||||
};
|
||||
std::map<std::string, aclprofAicoreMetrics> kAicMetrics{{"ArithmeticUtilization", ACL_AICORE_ARITHMETIC_UTILIZATION},
|
||||
{"PipeUtilization", ACL_AICORE_PIPE_UTILIZATION},
|
||||
{"Memory", ACL_AICORE_MEMORY_BANDWIDTH},
|
||||
{"MemoryLO", ACL_AICORE_L0B_AND_WIDTH},
|
||||
{"ResourceConflictRatio", ACL_AICORE_RESOURCE_CONFLICT_RATIO},
|
||||
{"MemoryUB", ACL_AICORE_MEMORY_UB},
|
||||
{"None", ACL_AICORE_NONE}};
|
||||
|
||||
std::shared_ptr<AscendProfiler> AscendProfiler::GetInstance() {
|
||||
auto instance = Profiler::GetInstance(kAscendDevice);
|
||||
|
@ -120,7 +120,9 @@ uint64_t AscendProfiler::GetOptionsMask() const {
|
|||
if (options_json["hccl"] == "on") {
|
||||
mask |= ACL_PROF_HCCL_TRACE;
|
||||
}
|
||||
|
||||
if (options_json["l2_cache"] == "on") {
|
||||
mask |= ACL_PROF_L2CACHE;
|
||||
}
|
||||
return mask;
|
||||
}
|
||||
|
||||
|
|
|
@ -472,7 +472,7 @@ class FlopsParser:
|
|||
|
||||
if not os.path.exists(_step_trace_file_path):
|
||||
logger.critical(f'The {_step_trace_file_path} file does not exist.')
|
||||
raise ProfilerFileNotFoundException(_step_trace_file_path)
|
||||
return op_all_step_time, op_all_step_comp
|
||||
try:
|
||||
with open(_step_trace_file_path, 'r') as f:
|
||||
lines = f.readlines()
|
||||
|
|
|
@ -30,6 +30,7 @@ class HWTSLogParser:
|
|||
output_filename (str): The output data path and name. Such as: './output_format_data_hwts_0.txt'.
|
||||
"""
|
||||
|
||||
GRAPH_MODE_MAX_TASKID = 65000
|
||||
_source_file_target_old = 'hwts.log.data.45.dev.profiler_default_tag'
|
||||
_source_file_target = 'hwts.data'
|
||||
_dst_file_title = 'title:45 HWTS data'
|
||||
|
@ -107,10 +108,10 @@ class HWTSLogParser:
|
|||
logger.info("Profiling: invalid hwts log record type %s", ms_type)
|
||||
continue
|
||||
|
||||
if task_id < task_id_threshold:
|
||||
if last_task_stream_map.get(stream_id, task_id) > task_id and self._dynamic_status:
|
||||
flip_times += 1
|
||||
task_id_str = str(stream_id) + "_" + str(task_id + flip_times * task_id_threshold)
|
||||
if HWTSLogParser.GRAPH_MODE_MAX_TASKID < last_task_stream_map.get(stream_id, task_id)\
|
||||
and task_id < last_task_stream_map.get(stream_id, task_id):
|
||||
flip_times += 1
|
||||
task_id_str = str(stream_id) + "_" + str(task_id + flip_times * task_id_threshold)
|
||||
result_data += ("%-14s %-4s %-8s %-9s %-8s %-15s %s\n" % (log_type[int(ms_type, 2)], cnt, core_id,
|
||||
blk_id, task_id_str, syscnt, stream_id))
|
||||
last_task_stream_map[stream_id] = task_id
|
||||
|
|
|
@ -130,6 +130,15 @@ class Profiler:
|
|||
_has_initialized = False
|
||||
_ascend_profiling_options = ""
|
||||
_ascend_job_id = ""
|
||||
_aicore_metrics_dict = {
|
||||
0: "ArithmeticUtilization",
|
||||
1: "PipeUtilization",
|
||||
2: "Memory",
|
||||
3: "MemoryL0",
|
||||
4: "ResourceConflictRatio",
|
||||
5: "MemoryUB",
|
||||
-1: "None"
|
||||
}
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
if Profiler._has_initialized:
|
||||
|
@ -150,6 +159,10 @@ class Profiler:
|
|||
self._rank_size = 0
|
||||
self._ascend_profiler = None
|
||||
_environment_check()
|
||||
# default aicore_metrics type is ArithmeticUtilization
|
||||
self._aicore_metrics_id = 0
|
||||
self._l2_cache = "off"
|
||||
self._parser_kwargs(kwargs)
|
||||
# get device_id and device_target
|
||||
self._get_devid_rankid_and_devtarget()
|
||||
self._get_output_path(kwargs)
|
||||
|
@ -262,8 +275,6 @@ class Profiler:
|
|||
# Setup and start MindData Profiling
|
||||
self._md_profiler = cde.GlobalContext.profiling_manager()
|
||||
self._md_profiler.init()
|
||||
if context.get_context("mode") == context.PYNATIVE_MODE:
|
||||
raise RuntimeError("Pynative mode is not supported on GPU currently.")
|
||||
self._parse_parameter_for_gpu(kwargs)
|
||||
|
||||
gpu_profiler = c_expression.Profiler
|
||||
|
@ -320,11 +331,12 @@ class Profiler:
|
|||
"bp_point": bp_point,
|
||||
"training_trace": "on",
|
||||
"task_trace": "on",
|
||||
"aic_metrics": "ArithmeticUtilization",
|
||||
"aic_metrics": Profiler._aicore_metrics_dict.get(self._aicore_metrics_id, "ArithmeticUtilization"),
|
||||
"aicpu": "on",
|
||||
"profile_memory": profile_memory,
|
||||
"hccl": profiler_communication,
|
||||
"parallel_strategy": "on"
|
||||
"l2_cache": self._l2_cache,
|
||||
"parallel_strategy": "on",
|
||||
}
|
||||
|
||||
return profiling_options
|
||||
|
@ -454,10 +466,7 @@ class Profiler:
|
|||
else:
|
||||
logger.info("No need to stop profiler because profiler has been stopped.")
|
||||
|
||||
if context.get_context("mode") == context.PYNATIVE_MODE:
|
||||
self._ascend_pynative_analyse()
|
||||
else:
|
||||
self._ascend_graph_analyse()
|
||||
self._ascend_graph_analyse()
|
||||
|
||||
# Call MSAdvisor function
|
||||
try:
|
||||
|
@ -701,17 +710,7 @@ class Profiler:
|
|||
self._gpu_profiler.step_profiling_enable(True)
|
||||
elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
|
||||
self._md_profiler.start()
|
||||
if context.get_context("mode") == context.PYNATIVE_MODE:
|
||||
self._ascend_pynative_start()
|
||||
else:
|
||||
self._ascend_graph_start()
|
||||
|
||||
def _ascend_pynative_start(self):
|
||||
"""Ascend pynative mode start profiling."""
|
||||
pynative_profiler = c_expression.Profiler
|
||||
self._pynative_profiler = pynative_profiler.get_instance("PyNative")
|
||||
self._pynative_profiler.init(self._output_path)
|
||||
self._ascend_profiler.start()
|
||||
self._ascend_graph_start()
|
||||
|
||||
def _ascend_graph_start(self):
|
||||
"""Ascend graph mode start profiling."""
|
||||
|
@ -763,8 +762,6 @@ class Profiler:
|
|||
if self._device_target and self._device_target == DeviceTarget.GPU.value:
|
||||
self._gpu_profiler.stop()
|
||||
elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
|
||||
if context.get_context("mode") == context.PYNATIVE_MODE:
|
||||
self._pynative_profiler.stop()
|
||||
self._ascend_profiler.stop()
|
||||
|
||||
self._stop_time = int(time.time() * 10000000)
|
||||
|
@ -1182,6 +1179,22 @@ class Profiler:
|
|||
logger.warning("The target dir already exists. "
|
||||
"There may be some old profiling data, and they will be rewritten in the end.")
|
||||
|
||||
def _parser_kwargs(self, kwargs):
|
||||
"""Parse kwargs vale."""
|
||||
self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
|
||||
if not isinstance(self._aicore_metrics_id, int) or self._aicore_metrics_id not in self._aicore_metrics_dict:
|
||||
logger.warning("aicore_metrics is an invalid value, it will be set to 0.")
|
||||
self._aicore_metrics_id = 0
|
||||
|
||||
l2_cache_enable = kwargs.pop("l2_cache", False)
|
||||
if not isinstance(l2_cache_enable, bool):
|
||||
logger.warning("l2_cache is an invalid value, it will be set to False.")
|
||||
|
||||
if l2_cache_enable:
|
||||
self._l2_cache = "on"
|
||||
else:
|
||||
self._l2_cache = "off"
|
||||
|
||||
def _analyse_hccl_info(self):
|
||||
"""Analyse hccl info."""
|
||||
hccl_path = os.path.join(self._output_path, "hccl_info_{}".format(self._rank_id))
|
||||
|
|
Loading…
Reference in New Issue