!44963 [profiler] add sync_enable parameter description

Merge pull request !44963 from zangqx/22Q3_profiler_perfect
commit 424c0b69d9 by i-robot, 2022-11-07 02:17:04 +00:00, committed by Gitee
5 changed files with 82 additions and 61 deletions


@@ -21,6 +21,10 @@ mindspore.Profiler
- 5: MemoryUB contains ub\_read/write_bw_mte, ub\_read/write_bw_vector, ub\_/write_bw_scalar, etc.
- **l2_cache** (bool, optional) - (Ascend only) Whether to collect l2 cache data; the data is collected when True. Default: False.
+- **sync_enable** (bool, optional) - (GPU only) Whether the profiler collects operator elapsed time synchronously. Default: True.
+  - True: synchronous mode. The start timestamp is recorded on the CPU before the operator is dispatched to the GPU, and the end timestamp is recorded after the operator finishes and control returns to the CPU. The operator's elapsed time is the difference between the two timestamps.
+  - False: asynchronous mode. The operator's elapsed time is the time taken to dispatch it from the CPU to the GPU. This mode reduces the profiler's impact on training time.

Raises:
- **RuntimeError** - When the CANN version does not match the MindSpore version, MindSpore cannot parse the generated ascend_job_id directory structure.
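
The difference between the two modes can be made concrete with a small timing sketch. This is not MindSpore's implementation: a background thread stands in for asynchronous GPU execution, and time.perf_counter supplies the CPU-side timestamps.

```python
# Minimal sketch (not MindSpore's implementation) of the two timing modes
# described above; a thread stands in for the GPU stream.
import threading
import time

def run_op():
    time.sleep(0.01)  # stand-in for an operator executing on the GPU

# sync_enable=True: timestamp before dispatch, timestamp after completion.
start = time.perf_counter()
worker = threading.Thread(target=run_op)
worker.start()
worker.join()  # wait for the operator to finish before taking the end timestamp
sync_cost = time.perf_counter() - start  # ~10 ms: the operator's real duration

# sync_enable=False: only the dispatch (launch) cost is measured.
start = time.perf_counter()
worker = threading.Thread(target=run_op)
worker.start()  # return immediately instead of waiting
async_cost = time.perf_counter() - start  # microseconds: launch overhead only
worker.join()

print(f"sync: {sync_cost * 1e3:.2f} ms, async: {async_cost * 1e3:.3f} ms")
```

Synchronous timing reflects the operator's true duration; asynchronous timing captures only the launch overhead, which is why sync_enable=False perturbs training time less.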


@@ -106,7 +106,7 @@ void ProfilingReporter::ReportTasks() const {
    ++task_index;
  }
-  MS_LOG(INFO) << "Profiling report task data finish.";
+  MS_LOG(INFO) << "Profiling report task data finish. cnode_size: " << task_index;
}
// This function only report model start and model end.


@@ -21,8 +21,7 @@ import os
-# one sys count takes 10 ns, 1 ms has 100000 system count
import re
-PER_MS_SYSCNT = 100000
+import stat
def to_int(param, param_name):
@@ -73,6 +72,7 @@ def fwrite_format(output_data_path, data_source=None, is_print=False, is_start=F
        else:
            f.write(data_source)
            f.write("\n")
+    os.chmod(output_data_path, stat.S_IREAD | stat.S_IWRITE)
    if is_print:
        if isinstance(data_source, (list, tuple)):
@@ -279,7 +279,8 @@ def calculate_percent(partial, total):
def to_millisecond(sys_count, limit=4):
    """Translate system count to millisecond."""
-    return round(sys_count / PER_MS_SYSCNT, limit)
+    per_ms_syscnt = 100000
+    return round(sys_count / per_ms_syscnt, limit)
def get_field_value(row_info, field_name, header, time_type='realtime'):
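
The change above inlines the conversion that the removed module-level comment documented: one system count takes 10 ns, so 100000 counts equal 1 ms. A minimal self-check of the to_millisecond arithmetic:

```python
# Self-contained check of the syscnt-to-millisecond conversion used above:
# one system count is 10 ns, so 100000 counts equal 1 ms.
def to_millisecond(sys_count, limit=4):
    """Translate system count to millisecond."""
    per_ms_syscnt = 100000
    return round(sys_count / per_ms_syscnt, limit)

assert to_millisecond(100000) == 1.0     # exactly 1 ms
assert to_millisecond(250000) == 2.5     # 250000 counts * 10 ns = 2.5 ms
assert to_millisecond(123456) == 1.2346  # rounded to limit=4 decimal places
```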


@@ -24,41 +24,6 @@ from mindspore.profiler.common.util import fwrite_format, get_file_join_name
from mindspore import log as logger
from mindspore.profiler.common.struct_type import StructType
-AiCpuStruct = namedtuple(
-    'AiCpuStruct', ['magic_number', 'data_tag', 'stream_id', 'task_id', 'run_start', 'run_start_counter',
-                    'compute_start', 'memcpy_start', 'memcpy_end', 'run_end', 'run_end_counter', 'thread',
-                    'device', 'submit_tick', 'schedule_tick', 'tick_before_run', 'tick_after_fun', 'kernel_type',
-                    'dispatch_time', 'total_time', 'FFTS_thread_id', 'version']
-)
-AI_CPU_STRUCT = dict(
-    magic_number=StructType.UINT16,
-    data_tag=StructType.UINT16,
-    stream_id=StructType.UINT16,
-    task_id=StructType.UINT16,
-    run_start=StructType.UINT64,
-    run_start_counter=StructType.UINT64,
-    compute_start=StructType.UINT64,
-    memcpy_start=StructType.UINT64,
-    memcpy_end=StructType.UINT64,
-    run_end=StructType.UINT64,
-    run_end_counter=StructType.UINT64,
-    thread=StructType.UINT32,
-    device=StructType.UINT32,
-    submit_tick=StructType.UINT64,
-    schedule_tick=StructType.UINT64,
-    tick_before_run=StructType.UINT64,
-    tick_after_fun=StructType.UINT64,
-    kernel_type=StructType.UINT32,
-    dispatch_time=StructType.UINT32,
-    total_time=StructType.UINT32,
-    FFTS_thread_id=StructType.UINT16,
-    version=StructType.UINT8
-)
class DataPreProcessParser:
    """
@@ -69,6 +34,41 @@ class DataPreProcessParser:
        output_filename(str): The output data path and name.
    """
+    AI_CPU_STRUCT = dict(
+        magic_number=StructType.UINT16,
+        data_tag=StructType.UINT16,
+        stream_id=StructType.UINT16,
+        task_id=StructType.UINT16,
+        run_start=StructType.UINT64,
+        run_start_counter=StructType.UINT64,
+        compute_start=StructType.UINT64,
+        memcpy_start=StructType.UINT64,
+        memcpy_end=StructType.UINT64,
+        run_end=StructType.UINT64,
+        run_end_counter=StructType.UINT64,
+        thread=StructType.UINT32,
+        device=StructType.UINT32,
+        submit_tick=StructType.UINT64,
+        schedule_tick=StructType.UINT64,
+        tick_before_run=StructType.UINT64,
+        tick_after_fun=StructType.UINT64,
+        kernel_type=StructType.UINT32,
+        dispatch_time=StructType.UINT32,
+        total_time=StructType.UINT32,
+        FFTS_thread_id=StructType.UINT16,
+        version=StructType.UINT8
+    )
+    AiCpuStruct = namedtuple(
+        'AiCpuStruct', ['magic_number', 'data_tag', 'stream_id', 'task_id', 'run_start', 'run_start_counter',
+                        'compute_start', 'memcpy_start', 'memcpy_end', 'run_end', 'run_end_counter', 'thread',
+                        'device', 'submit_tick', 'schedule_tick', 'tick_before_run', 'tick_after_fun', 'kernel_type',
+                        'dispatch_time', 'total_time', 'FFTS_thread_id', 'version']
+    )
    _source_file_target_old = 'DATA_PREPROCESS.dev.AICPU.'
    _source_file_target = 'DATA_PREPROCESS.AICPU.'
    _dst_file_title = 'title:DATA_PREPROCESS AICPU'
@@ -132,11 +132,11 @@ class DataPreProcessParser:
        serial_number = 1
        i = 0
-        ai_cpu_format = StructType.format(AI_CPU_STRUCT.values())
-        ai_cpu_size = StructType.sizeof(AI_CPU_STRUCT.values())
+        ai_cpu_format = StructType.format(DataPreProcessParser.AI_CPU_STRUCT.values())
+        ai_cpu_size = StructType.sizeof(DataPreProcessParser.AI_CPU_STRUCT.values())
        while i < len(content):
            ai_cpu_data = struct.unpack(ai_cpu_format, content[i:i + ai_cpu_size])
-            ai_cpu = AiCpuStruct(*ai_cpu_data)
+            ai_cpu = DataPreProcessParser.AiCpuStruct(*ai_cpu_data)
            if ai_cpu.task_id < self._task_id_threshold:
                node_type_name = f'{ai_cpu.stream_id}_{ai_cpu.task_id}'
                if self._op_task_dict and node_type_name in self._op_task_dict:
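
The loop above is a fixed-size binary record parser: AI_CPU_STRUCT supplies the field widths, StructType.format and StructType.sizeof turn them into a struct format string and a record size, and each unpacked tuple is wrapped in the AiCpuStruct namedtuple for named field access. A reduced, self-contained sketch of the same pattern, using a hypothetical three-field record rather than the real 22-field AI CPU layout:

```python
# Reduced sketch of the unpack-into-namedtuple pattern used by
# DataPreProcessParser; the three-field record is hypothetical.
import struct
from collections import namedtuple

Record = namedtuple('Record', ['magic_number', 'stream_id', 'run_start'])
RECORD_FORMAT = '<HHQ'  # uint16, uint16, uint64 (little-endian)
RECORD_SIZE = struct.calcsize(RECORD_FORMAT)

# Fake raw bytes standing in for the profiler's binary dump.
content = struct.pack(RECORD_FORMAT, 0x5A5A, 3, 123456789)

i = 0
while i < len(content):
    record = Record(*struct.unpack(RECORD_FORMAT, content[i:i + RECORD_SIZE]))
    print(record.stream_id, record.run_start)  # -> 3 123456789
    i += RECORD_SIZE
```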


@@ -88,6 +88,14 @@ class Profiler:
            - 5: MemoryUB contains ub_read/write_bw_mte, ub_read/write_bw_vector, ub\_/write_bw_scalar etc.
        l2_cache (bool, optional): (Ascend only) Whether to collect l2 cache data, collect when True. Default: False.
+        sync_enable (bool, optional): (GPU only) Whether the profiler collects operators in a synchronous way.
+            Default: True.
+
+            - True: The synchronous way. Before sending the operator to the GPU, the CPU records the start timestamp.
+              Then the operator is returned to the CPU after execution, and the end timestamp is recorded.
+              The duration of the operator is the difference between the two timestamps.
+            - False: The asynchronous way. The duration of the operator is the time it takes to send it from the
+              CPU to the GPU. This method can reduce the profiler's impact on training time.

    Raises:
        RuntimeError: When the version of CANN does not match the version of MindSpore,
@@ -137,6 +145,7 @@ class Profiler:
        ... # Profiler end
        ... profiler.analyse()
        """
+    SIZE_LIMIT = 500 * 1024 * 1024  # 500MB
    _hwts_output_filename_target = "output_format_data_hwts_"
    _opcompute_output_filename_target = "output_op_compute_time_"
@@ -185,6 +194,7 @@ class Profiler:
        # default aicore_metrics type is ArithmeticUtilization
        self._aicore_metrics_id = 0
        self._l2_cache = "off"
+        self._data_process_enable = True
        self._parser_kwargs(kwargs)
        # get device_id and device_target
        self._get_devid_rankid_and_devtarget()
@@ -389,10 +399,12 @@ class Profiler:
        self._cpu_profiler.step_profiling_enable(True)
        if self._device_target and self._device_target == DeviceTarget.GPU.value:
-            self._md_profiler.start()
+            if self._data_process_enable:
+                self._md_profiler.start()
            self._gpu_profiler.step_profiling_enable(True)
        elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
-            self._md_profiler.start()
+            if self._data_process_enable:
+                self._md_profiler.start()
            self._ascend_graph_start()
        ProfilerInfo.set_profiling_start_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
@@ -437,9 +449,9 @@ class Profiler:
        # No need to stop anything if parse profiling data offline
        if self._is_offline_parser():
            return
-        self._md_profiler.stop()
-        self._md_profiler.save(self._output_path)
+        if self._data_process_enable:
+            self._md_profiler.stop()
+            self._md_profiler.save(self._output_path)
        if self._device_target and self._device_target == DeviceTarget.GPU.value:
            self._gpu_profiler.stop()
@@ -462,7 +474,8 @@ class Profiler:
        self._device_target = context.get_context("device_target").lower()
        self._profiler_manager = c_expression.ProfilerManager.get_instance()
        self._cpu_profiler = c_expression.Profiler.get_instance("CPU")
-        self._md_profiler = cde.GlobalContext.profiling_manager()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
        if self._device_target == DeviceTarget.GPU.value:
            self._gpu_profiler = c_expression.Profiler.get_instance("GPU")
@@ -516,8 +529,9 @@ class Profiler:
    def _gpu_profiler_init(self, kwargs):
        """Gpu profiler init."""
        # Setup and start MindData Profiling
-        self._md_profiler = cde.GlobalContext.profiling_manager()
-        self._md_profiler.init()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
+            self._md_profiler.init()
        self._parse_parameter_for_gpu(kwargs)

        gpu_profiler = c_expression.Profiler
@@ -532,8 +546,9 @@ class Profiler:
    def _ascend_profiler_init(self, kwargs):
        """Ascend profiler init."""
        # Setup and start MindData Profiling
-        self._md_profiler = cde.GlobalContext.profiling_manager()
-        self._md_profiler.init()
+        if self._data_process_enable:
+            self._md_profiler = cde.GlobalContext.profiling_manager()
+            self._md_profiler.init()
        self._init_time = int(time.time() * 10000000)
        logger.info("Profiling: profiling init time: %d", self._init_time)
        self._parse_parameter_for_ascend(kwargs)
@@ -675,8 +690,7 @@ class Profiler:
        timeline_analyser = AscendTimelineGenerator(self._output_path, self._dev_id, self._rank_id,
                                                    self._rank_size, context.get_context("mode"))
        timeline_analyser.init_pynative_timeline()
-        size_limit = 100 * 1024 * 1024  # 100MB
-        timeline_analyser.write_timeline(size_limit)
+        timeline_analyser.write_timeline(Profiler.SIZE_LIMIT)
        timeline_analyser.write_timeline_summary()

    def _ascend_analyse(self):
@@ -957,11 +971,10 @@ class Profiler:
    def _cpu_analyse(self):
        """Collect and analyse cpu performance data."""
-        size_limit = 100 * 1024 * 1024  # 100MB
        try:
            timeline_generator = CpuTimelineGenerator(self._output_path, context.get_context("mode"))
            timeline_generator.init_timeline()
-            timeline_generator.write_timeline(size_limit)
+            timeline_generator.write_timeline(Profiler.SIZE_LIMIT)
            timeline_generator.write_timeline_summary()
        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
            logger.warning('Fail to write timeline data: %s', err)
@@ -1057,18 +1070,16 @@ class Profiler:
        min_cycle_counter = min(aicpu_parser.min_cycle_counter, optime_parser.min_cycle_counter)
        timeline_analyser.init_timeline(all_reduce_info, framework_info, aicpu_info,
                                        min_cycle_counter, source_path)
-        size_limit = 100 * 1024 * 1024  # 100MB
-        timeline_analyser.write_timeline(size_limit)
+        timeline_analyser.write_timeline(Profiler.SIZE_LIMIT)
        timeline_analyser.write_timeline_summary()

    def _generate_timeline(self, reduce_op_type):
        """Used for gpu, generate timeline info, write to json format file."""
        try:
-            size_limit = 100 * 1024 * 1024  # 100MB
            timeline_generator = GpuTimelineGenerator(self._output_path, self._dev_id, self._rank_size,
                                                      context.get_context("mode"))
            timeline_generator.init_timeline(reduce_op_type)
-            self._timeline_meta = timeline_generator.write_timeline(size_limit)
+            self._timeline_meta = timeline_generator.write_timeline(Profiler.SIZE_LIMIT)
            timeline_generator.write_timeline_summary()
            return timeline_generator
        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
@@ -1290,12 +1301,16 @@ class Profiler:
        if not isinstance(l2_cache_enable, bool):
            raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
                            f"but got type {type(l2_cache_enable)}")
        if l2_cache_enable:
            self._l2_cache = "on"
        else:
            self._l2_cache = "off"
+        self._data_process_enable = kwargs.pop("data_process_enable", True)
+        if not isinstance(self._data_process_enable, bool):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process_enable must be bool, "
+                            f"but got type {type(self._data_process_enable)}")

    def _analyse_hccl_info(self):
        """Analyse hccl info."""
        hccl_path = os.path.join(self._output_path, "hccl_info_{}".format(self._rank_id))
@@ -1314,6 +1329,7 @@ class Profiler:
                         "The hccl_parser-{version}-py3-none-any.whl package is usually located "
                         "in the /usr/local/Ascend/tools Directory", err)
            raise ImportError(err) from err
+        logger.info("Parse hccl info successfully.")
        logger.info("Start analyse hccl info.")
        hccl_parse = HcclParser(hccl_path, self._dev_id, self._rank_id, self._output_path)
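
Taken together, the profiling.py changes add one keyword argument, data_process_enable, which gates every MindData (dataset) profiling call. A hedged usage sketch: data_process_enable is introduced by this commit, sync_enable is the GPU option documented above, and the import paths follow the MindSpore API of this era.

```python
# Usage sketch for the knobs touched by this commit (assumes a GPU build).
import mindspore
from mindspore import Profiler

mindspore.set_context(mode=mindspore.GRAPH_MODE, device_target="GPU")

# Collect operator time asynchronously and skip dataset profiling to keep
# the profiler's impact on training time as small as possible.
profiler = Profiler(sync_enable=False, data_process_enable=False)
# ... run training steps ...
profiler.analyse()
```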