!44963 [profiler] add sync_enable parameter description

Merge pull request !44963 from zangqx/22Q3_profiler_perfect
commit 424c0b69d9 by i-robot, 2022-11-07 02:17:04 +00:00, committed by Gitee
5 changed files with 82 additions and 61 deletions


@@ -21,6 +21,10 @@ mindspore.Profiler
- 5: MemoryUB contains ub\_read/write_bw_mte, ub\_read/write_bw_vector, ub\_/write_bw_scalar, etc.
- **l2_cache** (bool, optional) - (Ascend only) Whether to collect l2 cache data; the data is collected when True. Default: False.
+- **sync_enable** (bool, optional) - (GPU only) Whether the profiler collects operator elapsed time synchronously. Default: True.
+  - True: synchronous mode. The start timestamp is recorded on the CPU before the operator is dispatched to the GPU, and the end timestamp is recorded after the operator finishes and control returns to the CPU. The operator's elapsed time is the difference between the two timestamps.
+  - False: asynchronous mode. The operator's elapsed time is the time taken to dispatch it from the CPU to the GPU. This mode reduces the profiler's impact on training time.

Raises:
- **RuntimeError** - When the CANN version does not match the MindSpore version, MindSpore cannot parse the generated ascend_job_id directory structure.
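
The difference between the two modes can be made concrete with a small timing sketch. This is not MindSpore's implementation: a background thread stands in for asynchronous GPU execution, and time.perf_counter supplies the CPU-side timestamps.

```python
# Minimal sketch (not MindSpore's implementation) of the two timing modes
# described above; a thread stands in for the GPU stream.
import threading
import time

def run_op():
    time.sleep(0.01)  # stand-in for an operator executing on the GPU

# sync_enable=True: timestamp before dispatch, timestamp after completion.
start = time.perf_counter()
worker = threading.Thread(target=run_op)
worker.start()
worker.join()  # wait for the operator to finish before taking the end timestamp
sync_cost = time.perf_counter() - start  # ~10 ms: the operator's real duration

# sync_enable=False: only the dispatch (launch) cost is measured.
start = time.perf_counter()
worker = threading.Thread(target=run_op)
worker.start()  # return immediately instead of waiting
async_cost = time.perf_counter() - start  # microseconds: launch overhead only
worker.join()

print(f"sync: {sync_cost * 1e3:.2f} ms, async: {async_cost * 1e3:.3f} ms")
```

Synchronous timing reflects the operator's true duration; asynchronous timing captures only the launch overhead, which is why sync_enable=False perturbs training time less.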


@@ -106,7 +106,7 @@ void ProfilingReporter::ReportTasks() const {
    ++task_index;
  }
-  MS_LOG(INFO) << "Profiling report task data finish.";
+  MS_LOG(INFO) << "Profiling report task data finish. cnode_size: " << task_index;
}
// This function only report model start and model end.


@@ -21,8 +21,7 @@ import os
-# one sys count takes 10 ns, 1 ms has 100000 system count
import re
-PER_MS_SYSCNT = 100000
+import stat
def to_int(param, param_name):
@@ -73,6 +72,7 @@ def fwrite_format(output_data_path, data_source=None, is_print=False, is_start=F
        else:
            f.write(data_source)
            f.write("\n")
+    os.chmod(output_data_path, stat.S_IREAD | stat.S_IWRITE)
    if is_print:
        if isinstance(data_source, (list, tuple)):
@@ -279,7 +279,8 @@ def calculate_percent(partial, total):
def to_millisecond(sys_count, limit=4):
    """Translate system count to millisecond."""
-    return round(sys_count / PER_MS_SYSCNT, limit)
+    per_ms_syscnt = 100000
+    return round(sys_count / per_ms_syscnt, limit)
def get_field_value(row_info, field_name, header, time_type='realtime'):
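
The change above inlines the conversion that the removed module-level comment documented: one system count takes 10 ns, so 100000 counts equal 1 ms. A minimal self-check of the to_millisecond arithmetic:

```python
# Self-contained check of the syscnt-to-millisecond conversion used above:
# one system count is 10 ns, so 100000 counts equal 1 ms.
def to_millisecond(sys_count, limit=4):
    """Translate system count to millisecond."""
    per_ms_syscnt = 100000
    return round(sys_count / per_ms_syscnt, limit)

assert to_millisecond(100000) == 1.0     # exactly 1 ms
assert to_millisecond(250000) == 2.5     # 250000 counts * 10 ns = 2.5 ms
assert to_millisecond(123456) == 1.2346  # rounded to limit=4 decimal places
```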


@@ -24,41 +24,6 @@ from mindspore.profiler.common.util import fwrite_format, get_file_join_name
from mindspore import log as logger
from mindspore.profiler.common.struct_type import StructType
-AiCpuStruct = namedtuple(
-    'AiCpuStruct', ['magic_number', 'data_tag', 'stream_id', 'task_id', 'run_start', 'run_start_counter',
-                    'compute_start', 'memcpy_start', 'memcpy_end', 'run_end', 'run_end_counter', 'thread',
-                    'device', 'submit_tick', 'schedule_tick', 'tick_before_run', 'tick_after_fun', 'kernel_type',
-                    'dispatch_time', 'total_time', 'FFTS_thread_id', 'version']
-)
-AI_CPU_STRUCT = dict(
-    magic_number=StructType.UINT16,
-    data_tag=StructType.UINT16,
-    stream_id=StructType.UINT16,
-    task_id=StructType.UINT16,
-    run_start=StructType.UINT64,
-    run_start_counter=StructType.UINT64,
-    compute_start=StructType.UINT64,
-    memcpy_start=StructType.UINT64,
-    memcpy_end=StructType.UINT64,
-    run_end=StructType.UINT64,
-    run_end_counter=StructType.UINT64,
-    thread=StructType.UINT32,
-    device=StructType.UINT32,
-    submit_tick=StructType.UINT64,
-    schedule_tick=StructType.UINT64,
-    tick_before_run=StructType.UINT64,
-    tick_after_fun=StructType.UINT64,
-    kernel_type=StructType.UINT32,
-    dispatch_time=StructType.UINT32,
-    total_time=StructType.UINT32,
-    FFTS_thread_id=StructType.UINT16,
-    version=StructType.UINT8
-)
class DataPreProcessParser:
    """
@@ -69,6 +34,41 @@ class DataPreProcessParser:
        output_filename(str): The output data path and name.
    """
+    AI_CPU_STRUCT = dict(
+        magic_number=StructType.UINT16,
+        data_tag=StructType.UINT16,
+        stream_id=StructType.UINT16,
+        task_id=StructType.UINT16,
+        run_start=StructType.UINT64,
+        run_start_counter=StructType.UINT64,
+        compute_start=StructType.UINT64,
+        memcpy_start=StructType.UINT64,
+        memcpy_end=StructType.UINT64,
+        run_end=StructType.UINT64,
+        run_end_counter=StructType.UINT64,
+        thread=StructType.UINT32,
+        device=StructType.UINT32,
+        submit_tick=StructType.UINT64,
+        schedule_tick=StructType.UINT64,
+        tick_before_run=StructType.UINT64,
+        tick_after_fun=StructType.UINT64,
+        kernel_type=StructType.UINT32,
+        dispatch_time=StructType.UINT32,
+        total_time=StructType.UINT32,
+        FFTS_thread_id=StructType.UINT16,
+        version=StructType.UINT8
+    )
+    AiCpuStruct = namedtuple(
+        'AiCpuStruct', ['magic_number', 'data_tag', 'stream_id', 'task_id', 'run_start', 'run_start_counter',
+                        'compute_start', 'memcpy_start', 'memcpy_end', 'run_end', 'run_end_counter', 'thread',
+                        'device', 'submit_tick', 'schedule_tick', 'tick_before_run', 'tick_after_fun', 'kernel_type',
+                        'dispatch_time', 'total_time', 'FFTS_thread_id', 'version']
+    )
    _source_file_target_old = 'DATA_PREPROCESS.dev.AICPU.'
    _source_file_target = 'DATA_PREPROCESS.AICPU.'
    _dst_file_title = 'title:DATA_PREPROCESS AICPU'
@@ -132,11 +132,11 @@ class DataPreProcessParser:
        serial_number = 1
        i = 0
-        ai_cpu_format = StructType.format(AI_CPU_STRUCT.values())
-        ai_cpu_size = StructType.sizeof(AI_CPU_STRUCT.values())
+        ai_cpu_format = StructType.format(DataPreProcessParser.AI_CPU_STRUCT.values())
+        ai_cpu_size = StructType.sizeof(DataPreProcessParser.AI_CPU_STRUCT.values())
        while i < len(content):
            ai_cpu_data = struct.unpack(ai_cpu_format, content[i:i + ai_cpu_size])
-            ai_cpu = AiCpuStruct(*ai_cpu_data)
+            ai_cpu = DataPreProcessParser.AiCpuStruct(*ai_cpu_data)
            if ai_cpu.task_id < self._task_id_threshold:
                node_type_name = f'{ai_cpu.stream_id}_{ai_cpu.task_id}'
                if self._op_task_dict and node_type_name in self._op_task_dict:
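
The loop above is a fixed-size binary record parser: AI_CPU_STRUCT supplies the field widths, StructType.format and StructType.sizeof turn them into a struct format string and a record size, and each unpacked tuple is wrapped in the AiCpuStruct namedtuple for named field access. A reduced, self-contained sketch of the same pattern, using a hypothetical three-field record rather than the real 22-field AI CPU layout:

```python
# Reduced sketch of the unpack-into-namedtuple pattern used by
# DataPreProcessParser; the three-field record is hypothetical.
import struct
from collections import namedtuple

Record = namedtuple('Record', ['magic_number', 'stream_id', 'run_start'])
RECORD_FORMAT = '<HHQ'  # uint16, uint16, uint64 (little-endian)
RECORD_SIZE = struct.calcsize(RECORD_FORMAT)

# Fake raw bytes standing in for the profiler's binary dump.
content = struct.pack(RECORD_FORMAT, 0x5A5A, 3, 123456789)

i = 0
while i < len(content):
    record = Record(*struct.unpack(RECORD_FORMAT, content[i:i + RECORD_SIZE]))
    print(record.stream_id, record.run_start)  # -> 3 123456789
    i += RECORD_SIZE
```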


@@ -88,6 +88,14 @@ class Profiler:
            - 5: MemoryUB contains ub_read/write_bw_mte, ub_read/write_bw_vector, ub\_/write_bw_scalar etc.
        l2_cache (bool, optional): (Ascend only) Whether to collect l2 cache data, collect when True. Default: False.
+        sync_enable (bool, optional): (GPU only) Whether the profiler collects operators in a synchronous way.
+            Default: True.
+
+            - True: The synchronous way. Before sending the operator to the GPU, the CPU records the start timestamp.
+              Then the operator is returned to the CPU after execution, and the end timestamp is recorded.
+              The duration of the operator is the difference between the two timestamps.
+            - False: The asynchronous way. The duration of the operator is the time it takes to send it from the
+              CPU to the GPU. This method can reduce the profiler's impact on training time.

    Raises:
        RuntimeError: When the version of CANN does not match the version of MindSpore,
@@ -137,6 +145,7 @@ class Profiler:
        ... # Profiler end
        ... profiler.analyse()
        """
+    SIZE_LIMIT = 500 * 1024 * 1024  # 500MB
    _hwts_output_filename_target = "output_format_data_hwts_"
    _opcompute_output_filename_target = "output_op_compute_time_"
@@ -185,6 +194,7 @@ class Profiler:
        # default aicore_metrics type is ArithmeticUtilization
        self._aicore_metrics_id = 0
        self._l2_cache = "off"
+        self._data_process_enable = True
        self._parser_kwargs(kwargs)
        # get device_id and device_target
        self._get_devid_rankid_and_devtarget()
@@ -389,10 +399,12 @@ class Profiler:
        self._cpu_profiler.step_profiling_enable(True)
        if self._device_target and self._device_target == DeviceTarget.GPU.value:
-            self._md_profiler.start()
+            if self._data_process_enable:
+                self._md_profiler.start()
            self._gpu_profiler.step_profiling_enable(True)
        elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
-            self._md_profiler.start()
+            if self._data_process_enable:
+                self._md_profiler.start()
            self._ascend_graph_start()
        ProfilerInfo.set_profiling_start_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
@@ -437,9 +449,9 @@ class Profiler:
        # No need to stop anything if parse profiling data offline
        if self._is_offline_parser():
            return
-        self._md_profiler.stop()
-        self._md_profiler.save(self._output_path)
+        if self._data_process_enable:
+            self._md_profiler.stop()
+            self._md_profiler.save(self._output_path)
        if self._device_target and self._device_target == DeviceTarget.GPU.value:
            self._gpu_profiler.stop()
@@ -462,7 +474,8 @@ class Profiler:
        self._device_target = context.get_context("device_target").lower()
        self._profiler_manager = c_expression.ProfilerManager.get_instance()
        self._cpu_profiler = c_expression.Profiler.get_instance("CPU")
-        self._md_profiler = cde.GlobalContext.profiling_manager()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
        if self._device_target == DeviceTarget.GPU.value:
            self._gpu_profiler = c_expression.Profiler.get_instance("GPU")
@@ -516,8 +529,9 @@ class Profiler:
    def _gpu_profiler_init(self, kwargs):
        """Gpu profiler init."""
        # Setup and start MindData Profiling
-        self._md_profiler = cde.GlobalContext.profiling_manager()
-        self._md_profiler.init()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
+            self._md_profiler.init()
        self._parse_parameter_for_gpu(kwargs)

        gpu_profiler = c_expression.Profiler
@@ -532,8 +546,9 @@ class Profiler:
    def _ascend_profiler_init(self, kwargs):
        """Ascend profiler init."""
        # Setup and start MindData Profiling
-        self._md_profiler = cde.GlobalContext.profiling_manager()
-        self._md_profiler.init()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
+            self._md_profiler.init()
        self._init_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling init time: %d", self._init_time)
        self._parse_parameter_for_ascend(kwargs)
@@ -675,8 +690,7 @@ class Profiler:
        timeline_analyser = AscendTimelineGenerator(self._output_path, self._dev_id, self._rank_id,
                                                    self._rank_size, context.get_context("mode"))
        timeline_analyser.init_pynative_timeline()
-        size_limit = 100 * 1024 * 1024  # 100MB
-        timeline_analyser.write_timeline(size_limit)
+        timeline_analyser.write_timeline(Profiler.SIZE_LIMIT)
        timeline_analyser.write_timeline_summary()

    def _ascend_analyse(self):
@@ -957,11 +971,10 @@ class Profiler:
    def _cpu_analyse(self):
        """Collect and analyse cpu performance data."""
-        size_limit = 100 * 1024 * 1024  # 100MB
        try:
            timeline_generator = CpuTimelineGenerator(self._output_path, context.get_context("mode"))
            timeline_generator.init_timeline()
-            timeline_generator.write_timeline(size_limit)
+            timeline_generator.write_timeline(Profiler.SIZE_LIMIT)
            timeline_generator.write_timeline_summary()
        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
            logger.warning('Fail to write timeline data: %s', err)
@@ -1057,18 +1070,16 @@ class Profiler:
        min_cycle_counter = min(aicpu_parser.min_cycle_counter, optime_parser.min_cycle_counter)
        timeline_analyser.init_timeline(all_reduce_info, framework_info, aicpu_info,
                                        min_cycle_counter, source_path)
-        size_limit = 100 * 1024 * 1024  # 100MB
-        timeline_analyser.write_timeline(size_limit)
+        timeline_analyser.write_timeline(Profiler.SIZE_LIMIT)
        timeline_analyser.write_timeline_summary()

    def _generate_timeline(self, reduce_op_type):
        """Used for gpu, generate timeline info, write to json format file."""
        try:
-            size_limit = 100 * 1024 * 1024  # 100MB
            timeline_generator = GpuTimelineGenerator(self._output_path, self._dev_id, self._rank_size,
                                                      context.get_context("mode"))
            timeline_generator.init_timeline(reduce_op_type)
-            self._timeline_meta = timeline_generator.write_timeline(size_limit)
+            self._timeline_meta = timeline_generator.write_timeline(Profiler.SIZE_LIMIT)
            timeline_generator.write_timeline_summary()
            return timeline_generator
        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
@@ -1290,12 +1301,16 @@ class Profiler:
        if not isinstance(l2_cache_enable, bool):
            raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
                            f"but got type {type(l2_cache_enable)}")
        if l2_cache_enable:
            self._l2_cache = "on"
        else:
            self._l2_cache = "off"
+        self._data_process_enable = kwargs.pop("data_process_enable", True)
+        if not isinstance(self._data_process_enable, bool):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process_enable must be bool, "
+                            f"but got type {type(self._data_process_enable)}")

    def _analyse_hccl_info(self):
        """Analyse hccl info."""
        hccl_path = os.path.join(self._output_path, "hccl_info_{}".format(self._rank_id))
@@ -1314,6 +1329,7 @@ class Profiler:
                         "The hccl_parser-{version}-py3-none-any.whl package is usually located "
                         "in the /usr/local/Ascend/tools Directory", err)
            raise ImportError(err) from err
+        logger.info("Parse hccl info successfully.")
        logger.info("Start analyse hccl info.")
        hccl_parse = HcclParser(hccl_path, self._dev_id, self._rank_id, self._output_path)
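
Taken together, the profiling.py changes add one keyword argument, data_process_enable, which gates every MindData (dataset) profiling call. A hedged usage sketch: data_process_enable is introduced by this commit, sync_enable is the GPU option documented above, and the import paths follow the MindSpore API of this era.

```python
# Usage sketch for the knobs touched by this commit (assumes a GPU build).
import mindspore
from mindspore import Profiler

mindspore.set_context(mode=mindspore.GRAPH_MODE, device_target="GPU")

# Collect operator time asynchronously and skip dataset profiling to keep
# the profiler's impact on training time as small as possible.
profiler = Profiler(sync_enable=False, data_process_enable=False)
# ... run training steps ...
profiler.analyse()
```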