!44963 [profiler] add sync_enable parameter description
Merge pull request !44963 from zangqx/22Q3_profiler_perfect
commit 424c0b69d9
@@ -21,6 +21,10 @@ mindspore.Profiler
       - 5: MemoryUB, including ub\_read/write_bw_mte, ub\_read/write_bw_vector, ub\_/write_bw_scalar, etc.

     - **l2_cache** (bool, optional) - (Ascend only) Whether to collect l2 cache data; the data is collected when True. Default: False.
+    - **sync_enable** (bool, optional) - (GPU only) Whether the profiler collects operator elapsed time synchronously. Default: True.
+
+      - True: synchronous mode. A start timestamp is recorded on the CPU before the operator is dispatched to the GPU, and an end timestamp is recorded after the operator finishes and control returns to the CPU. The operator's elapsed time is the difference between the two timestamps.
+      - False: asynchronous mode. The operator's elapsed time is the time taken to dispatch it from the CPU to the GPU. This mode reduces the profiler's impact on training time.

     Raises:
     - **RuntimeError** - When the CANN version does not match the MindSpore version, MindSpore cannot parse the generated ascend_job_id directory structure.
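For readers skimming the diff, a minimal usage sketch of the new parameter, assuming only the behavior documented above (the training run itself is elided):

```python
from mindspore import Profiler

# sync_enable=False trades per-operator timestamp accuracy for lower
# overhead: each operator's time is only its CPU-to-GPU dispatch time.
profiler = Profiler(sync_enable=False)
# ... run the training network here ...
profiler.analyse()
```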
@@ -106,7 +106,7 @@ void ProfilingReporter::ReportTasks() const {
     ++task_index;
   }
-  MS_LOG(INFO) << "Profiling report task data finish.";
+  MS_LOG(INFO) << "Profiling report task data finish. cnode_size: " << task_index;
 }

 // This function only report model start and model end.
@@ -21,8 +21,7 @@ import os
-# one sys count takes 10 ns, 1 ms has 100000 system count
 import re
-PER_MS_SYSCNT = 100000
 import stat


 def to_int(param, param_name):
@@ -73,6 +72,7 @@ def fwrite_format(output_data_path, data_source=None, is_print=False, is_start=F
         else:
             f.write(data_source)
             f.write("\n")
+    os.chmod(output_data_path, stat.S_IREAD | stat.S_IWRITE)

     if is_print:
         if isinstance(data_source, (list, tuple)):
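The added `os.chmod` call restricts the profiler's output file to owner read/write. A standalone sketch of the same hardening pattern (the file name here is illustrative):

```python
import os
import stat

path = "profiler_output.txt"  # illustrative name
with open(path, "w") as f:
    f.write("data\n")
# stat.S_IREAD | stat.S_IWRITE is the owner read/write mask, i.e. 0o600.
os.chmod(path, stat.S_IREAD | stat.S_IWRITE)
```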
@@ -279,7 +279,8 @@ def calculate_percent(partial, total):

 def to_millisecond(sys_count, limit=4):
     """Translate system count to millisecond."""
-    return round(sys_count / PER_MS_SYSCNT, limit)
+    per_ms_syscnt = 100000
+    return round(sys_count / per_ms_syscnt, limit)


 def get_field_value(row_info, field_name, header, time_type='realtime'):
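Per the comment above, one system count is 10 ns, so 100000 counts make one millisecond. A self-contained sketch of the conversion with a worked value:

```python
PER_MS_SYSCNT = 100000  # one sys count takes 10 ns, so 1 ms = 100000 counts

def to_millisecond(sys_count, limit=4):
    """Translate system count to millisecond."""
    return round(sys_count / PER_MS_SYSCNT, limit)

print(to_millisecond(250000))  # 2.5 -- 250000 counts * 10 ns = 2.5 ms
```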
@@ -24,41 +24,6 @@ from mindspore.profiler.common.util import fwrite_format, get_file_join_name
 from mindspore import log as logger
 from mindspore.profiler.common.struct_type import StructType

-AiCpuStruct = namedtuple(
-    'AiCpuStruct', ['magic_number', 'data_tag', 'stream_id', 'task_id', 'run_start', 'run_start_counter',
-                    'compute_start', 'memcpy_start', 'memcpy_end', 'run_end', 'run_end_counter', 'thread',
-                    'device', 'submit_tick', 'schedule_tick', 'tick_before_run', 'tick_after_fun', 'kernel_type',
-                    'dispatch_time', 'total_time', 'FFTS_thread_id', 'version']
-)
-
-AI_CPU_STRUCT = dict(
-    magic_number=StructType.UINT16,
-    data_tag=StructType.UINT16,
-    stream_id=StructType.UINT16,
-    task_id=StructType.UINT16,
-    run_start=StructType.UINT64,
-    run_start_counter=StructType.UINT64,
-
-    compute_start=StructType.UINT64,
-    memcpy_start=StructType.UINT64,
-    memcpy_end=StructType.UINT64,
-    run_end=StructType.UINT64,
-    run_end_counter=StructType.UINT64,
-    thread=StructType.UINT32,
-
-    device=StructType.UINT32,
-    submit_tick=StructType.UINT64,
-    schedule_tick=StructType.UINT64,
-    tick_before_run=StructType.UINT64,
-    tick_after_fun=StructType.UINT64,
-    kernel_type=StructType.UINT32,
-
-    dispatch_time=StructType.UINT32,
-    total_time=StructType.UINT32,
-    FFTS_thread_id=StructType.UINT16,
-    version=StructType.UINT8
-)
-
-
 class DataPreProcessParser:
     """
@@ -69,6 +34,41 @@ class DataPreProcessParser:
         output_filename(str): The output data path and name.

     """
+    AI_CPU_STRUCT = dict(
+        magic_number=StructType.UINT16,
+        data_tag=StructType.UINT16,
+        stream_id=StructType.UINT16,
+        task_id=StructType.UINT16,
+        run_start=StructType.UINT64,
+        run_start_counter=StructType.UINT64,
+
+        compute_start=StructType.UINT64,
+        memcpy_start=StructType.UINT64,
+        memcpy_end=StructType.UINT64,
+        run_end=StructType.UINT64,
+        run_end_counter=StructType.UINT64,
+        thread=StructType.UINT32,
+
+        device=StructType.UINT32,
+        submit_tick=StructType.UINT64,
+        schedule_tick=StructType.UINT64,
+        tick_before_run=StructType.UINT64,
+        tick_after_fun=StructType.UINT64,
+        kernel_type=StructType.UINT32,
+
+        dispatch_time=StructType.UINT32,
+        total_time=StructType.UINT32,
+        FFTS_thread_id=StructType.UINT16,
+        version=StructType.UINT8
+    )
+
+    AiCpuStruct = namedtuple(
+        'AiCpuStruct', ['magic_number', 'data_tag', 'stream_id', 'task_id', 'run_start', 'run_start_counter',
+                        'compute_start', 'memcpy_start', 'memcpy_end', 'run_end', 'run_end_counter', 'thread',
+                        'device', 'submit_tick', 'schedule_tick', 'tick_before_run', 'tick_after_fun', 'kernel_type',
+                        'dispatch_time', 'total_time', 'FFTS_thread_id', 'version']
+    )
+
     _source_file_target_old = 'DATA_PREPROCESS.dev.AICPU.'
     _source_file_target = 'DATA_PREPROCESS.AICPU.'
     _dst_file_title = 'title:DATA_PREPROCESS AICPU'
@@ -132,11 +132,11 @@ class DataPreProcessParser:
         serial_number = 1

         i = 0
-        ai_cpu_format = StructType.format(AI_CPU_STRUCT.values())
-        ai_cpu_size = StructType.sizeof(AI_CPU_STRUCT.values())
+        ai_cpu_format = StructType.format(DataPreProcessParser.AI_CPU_STRUCT.values())
+        ai_cpu_size = StructType.sizeof(DataPreProcessParser.AI_CPU_STRUCT.values())
         while i < len(content):
             ai_cpu_data = struct.unpack(ai_cpu_format, content[i:i + ai_cpu_size])
-            ai_cpu = AiCpuStruct(*ai_cpu_data)
+            ai_cpu = DataPreProcessParser.AiCpuStruct(*ai_cpu_data)
             if ai_cpu.task_id < self._task_id_threshold:
                 node_type_name = f'{ai_cpu.stream_id}_{ai_cpu.task_id}'
                 if self._op_task_dict and node_type_name in self._op_task_dict:
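The loop above walks the binary blob in fixed-size strides, unpacking each AI CPU record into a namedtuple. A standalone sketch of the same pattern using only the standard `struct` module (`StructType` is MindSpore-internal; the little-endian format and the four fields here are illustrative, not the real 22-field layout):

```python
import struct
from collections import namedtuple

RECORD_FMT = "<HHHH"  # four uint16 fields, e.g. magic_number..task_id
Record = namedtuple("Record", ["magic_number", "data_tag", "stream_id", "task_id"])
RECORD_SIZE = struct.calcsize(RECORD_FMT)

def iter_records(content):
    """Yield fixed-size records from a binary blob, like the while-loop above."""
    for i in range(0, len(content) - RECORD_SIZE + 1, RECORD_SIZE):
        yield Record(*struct.unpack(RECORD_FMT, content[i:i + RECORD_SIZE]))

blob = struct.pack(RECORD_FMT, 0x5A5A, 1, 0, 7) + struct.pack(RECORD_FMT, 0x5A5A, 1, 0, 9)
for rec in iter_records(blob):
    print(rec.stream_id, rec.task_id)
```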
@@ -88,6 +88,14 @@ class Profiler:
             - 5: MemoryUB contains ub_read/write_bw_mte, ub_read/write_bw_vector, ub\_/write_bw_scalar etc.

         l2_cache (bool, optional): (Ascend only) Whether to collect l2 cache data, collect when True. Default: False.
+        sync_enable (bool, optional): (GPU only) Whether the profiler collects operators in a synchronous way.
+            Default: True.
+
+            - True: The synchronous way. Before sending the operator to the GPU, the CPU records the start timestamp.
+              Then the operator is returned to the CPU after execution, and the end timestamp is recorded.
+              The duration of the operator is the difference between the two timestamps.
+            - False: The asynchronous way. The duration of the operator is the time it takes to send it from the
+              CPU to the GPU. This method can reduce the impact of the profiler on training time.

     Raises:
         RuntimeError: When the version of CANN does not match the version of MindSpore,
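Conceptually, the synchronous mode brackets a blocking launch with two CPU-side timestamps. A toy sketch of that timing scheme (the names and the stand-in op are illustrative, not profiler internals):

```python
import time

def timed_launch(op, *args):
    start = time.perf_counter_ns()  # CPU-side start timestamp, before dispatch
    result = op(*args)              # returns only after the device work completes
    end = time.perf_counter_ns()    # CPU-side end timestamp, after return
    return result, (end - start) / 1e6  # operator duration in ms

print(timed_launch(sum, range(1000)))
```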
@@ -137,6 +145,7 @@ class Profiler:
         ...     # Profiler end
         ...     profiler.analyse()
         """
+    SIZE_LIMIT = 500 * 1024 * 1024  # 500MB

     _hwts_output_filename_target = "output_format_data_hwts_"
     _opcompute_output_filename_target = "output_op_compute_time_"
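This new class constant replaces the `size_limit = 100 * 1024 * 1024` locals repeated in the timeline methods below, and raises the write cap from 100MB to 500MB. The pattern in miniature (the writer class is illustrative):

```python
class TimelineWriter:
    SIZE_LIMIT = 500 * 1024 * 1024  # 500MB, defined once instead of per method

    def write(self, generator):
        # every call site reads the single shared limit
        generator.write_timeline(TimelineWriter.SIZE_LIMIT)
```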
@@ -185,6 +194,7 @@ class Profiler:
         # default aicore_metrics type is ArithmeticUtilization
         self._aicore_metrics_id = 0
         self._l2_cache = "off"
+        self._data_process_enable = True
         self._parser_kwargs(kwargs)
         # get device_id and device_target
         self._get_devid_rankid_and_devtarget()
@@ -389,10 +399,12 @@ class Profiler:
         self._cpu_profiler.step_profiling_enable(True)

         if self._device_target and self._device_target == DeviceTarget.GPU.value:
-            self._md_profiler.start()
+            if self._data_process_enable:
+                self._md_profiler.start()
             self._gpu_profiler.step_profiling_enable(True)
         elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
-            self._md_profiler.start()
+            if self._data_process_enable:
+                self._md_profiler.start()
             self._ascend_graph_start()
         ProfilerInfo.set_profiling_start_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
@@ -437,9 +449,9 @@ class Profiler:
         # No need to stop anything if parse profiling data offline
         if self._is_offline_parser():
             return

-        self._md_profiler.stop()
-        self._md_profiler.save(self._output_path)
+        if self._data_process_enable:
+            self._md_profiler.stop()
+            self._md_profiler.save(self._output_path)

         if self._device_target and self._device_target == DeviceTarget.GPU.value:
             self._gpu_profiler.stop()
@@ -462,7 +474,8 @@ class Profiler:
         self._device_target = context.get_context("device_target").lower()
         self._profiler_manager = c_expression.ProfilerManager.get_instance()
         self._cpu_profiler = c_expression.Profiler.get_instance("CPU")
-        self._md_profiler = cde.GlobalContext.profiling_manager()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
         if self._device_target == DeviceTarget.GPU.value:
             self._gpu_profiler = c_expression.Profiler.get_instance("GPU")
@@ -516,8 +529,9 @@ class Profiler:
     def _gpu_profiler_init(self, kwargs):
         """Gpu profiler init."""
         # Setup and start MindData Profiling
-        self._md_profiler = cde.GlobalContext.profiling_manager()
-        self._md_profiler.init()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
+            self._md_profiler.init()
         self._parse_parameter_for_gpu(kwargs)

         gpu_profiler = c_expression.Profiler
@@ -532,8 +546,9 @@ class Profiler:
     def _ascend_profiler_init(self, kwargs):
         """Ascend profiler init."""
        # Setup and start MindData Profiling
-        self._md_profiler = cde.GlobalContext.profiling_manager()
-        self._md_profiler.init()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
+            self._md_profiler.init()
         self._init_time = int(time.time() * 10000000)
         logger.info("Profiling: profiling init time: %d", self._init_time)
         self._parse_parameter_for_ascend(kwargs)
@@ -675,8 +690,7 @@ class Profiler:
         timeline_analyser = AscendTimelineGenerator(self._output_path, self._dev_id, self._rank_id,
                                                     self._rank_size, context.get_context("mode"))
         timeline_analyser.init_pynative_timeline()
-        size_limit = 100 * 1024 * 1024  # 100MB
-        timeline_analyser.write_timeline(size_limit)
+        timeline_analyser.write_timeline(Profiler.SIZE_LIMIT)
         timeline_analyser.write_timeline_summary()

     def _ascend_analyse(self):
@@ -957,11 +971,10 @@ class Profiler:
     def _cpu_analyse(self):
         """Collect and analyse cpu performance data."""

-        size_limit = 100 * 1024 * 1024  # 100MB
         try:
             timeline_generator = CpuTimelineGenerator(self._output_path, context.get_context("mode"))
             timeline_generator.init_timeline()
-            timeline_generator.write_timeline(size_limit)
+            timeline_generator.write_timeline(Profiler.SIZE_LIMIT)
             timeline_generator.write_timeline_summary()
         except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
             logger.warning('Fail to write timeline data: %s', err)
@@ -1057,18 +1070,16 @@ class Profiler:
         min_cycle_counter = min(aicpu_parser.min_cycle_counter, optime_parser.min_cycle_counter)
         timeline_analyser.init_timeline(all_reduce_info, framework_info, aicpu_info,
                                         min_cycle_counter, source_path)
-        size_limit = 100 * 1024 * 1024  # 100MB
-        timeline_analyser.write_timeline(size_limit)
+        timeline_analyser.write_timeline(Profiler.SIZE_LIMIT)
         timeline_analyser.write_timeline_summary()

     def _generate_timeline(self, reduce_op_type):
         """Used for gpu, generate timeline info, write to json format file."""
         try:
-            size_limit = 100 * 1024 * 1024  # 100MB
             timeline_generator = GpuTimelineGenerator(self._output_path, self._dev_id, self._rank_size,
                                                       context.get_context("mode"))
             timeline_generator.init_timeline(reduce_op_type)
-            self._timeline_meta = timeline_generator.write_timeline(size_limit)
+            self._timeline_meta = timeline_generator.write_timeline(Profiler.SIZE_LIMIT)
             timeline_generator.write_timeline_summary()
             return timeline_generator
         except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
@@ -1290,12 +1301,16 @@ class Profiler:
         if not isinstance(l2_cache_enable, bool):
             raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
                             f"but got type {type(l2_cache_enable)}")

         if l2_cache_enable:
             self._l2_cache = "on"
         else:
             self._l2_cache = "off"

+        self._data_process_enable = kwargs.pop("data_process_enable", True)
+        if not isinstance(self._data_process_enable, bool):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process_enable must be bool, "
+                            f"but got type {type(self._data_process_enable)}")
+
     def _analyse_hccl_info(self):
         """Analyse hccl info."""
         hccl_path = os.path.join(self._output_path, "hccl_info_{}".format(self._rank_id))
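Taken together with the guards above, this makes MindData (dataset pipeline) profiling optional. A minimal usage sketch of the new keyword, assuming only the behavior shown in this diff:

```python
from mindspore import Profiler

# With data_process_enable=False the MindData profiler is never
# initialized, started, stopped, or saved.
profiler = Profiler(data_process_enable=False)
# ... run the training network here ...
profiler.analyse()
```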
@@ -1314,6 +1329,7 @@ class Profiler:
                            "The hccl_parser-{version}-py3-none-any.whl package is usually located "
                            "in the /usr/local/Ascend/tools Directory", err)
             raise ImportError(err) from err

         logger.info("Parse hccl info successfully.")
+        logger.info("Start analyse hccl info.")
         hccl_parse = HcclParser(hccl_path, self._dev_id, self._rank_id, self._output_path)