!30003 add cpu timeline profiling and change threadpool default threads

Merge pull request !30003 from fangzehua/add_profi
This commit is contained in:
i-robot 2022-02-15 06:21:15 +00:00 committed by Gitee
commit d37c8719fd
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
6 changed files with 81 additions and 8 deletions

View File

@ -138,7 +138,7 @@ MindSpore context用于配置当前执行环境包括执行模式、执行
- **grad_for_scalar** (bool) 表示是否获取标量梯度。默认值False。当 `grad_for_scalar` 设置为True时则可以导出函数的标量输入。由于后端目前不支持伸缩操作所以该接口只支持在前端可推演的简单操作。
- **enable_compile_cache** (bool) - 表示是否加载或者保存前端编译的图。当 `enable_compile_cache` 被设置为True时在第一次执行的过程中一个硬件无关的编译缓存会被生成并且导出为一个MINDIR文件。当该网络被再次执行时如果 `enable_compile_cache` 仍然为True并且网络脚本没有被更改那么这个编译缓存会被加载。注意目前只支持有限的Python脚本更改的自动检测这意味着可能有正确性风险。默认值False。这是一个实验特性可能会被更改或者删除。
- **compile_cache_path** (str) - 保存前端图编译缓存的路径。默认值:"."。如果目录不存在,系统会自动创建这个目录。缓存会被保存到如下目录: `compile_cache_path/rank_${rank_id}/``rank_id` 是集群上当前设备的ID。
- **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为系统线程数的0.6倍
- **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为30
**异常:**

View File

@ -30,7 +30,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread
const size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
auto runtime_num_threads = static_cast<size_t>(context_ptr->get_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS));
size_t runtime_num_threads_min = std::min(runtime_num_threads, cpu_core_num);
const float kActorUsage = 0.2;
const float kActorUsage = 0.18;
const size_t kActorThreadMinNum = 2;
size_t actor_thread_max_num =
std::max(static_cast<size_t>(std::floor(runtime_num_threads_min * kActorUsage)), kActorThreadMinNum);

View File

@ -101,10 +101,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true);
set_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER, false);
size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
constexpr float kCpuUsage = 0.6;
uint32_t runtime_num_threads = std::max(static_cast<int>(std::floor(cpu_core_num * kCpuUsage)), 1);
set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, runtime_num_threads);
uint32_t kDefaultRuntimeNumThreads = 30;
set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, kDefaultRuntimeNumThreads);
backend_policy_ = policy_map_[policy];
}

View File

@ -830,7 +830,7 @@ def set_context(**kwargs):
The cache will be saved to the directory of `compile_cache_path/rank_${rank_id}/`. The `rank_id` is
the ID of the current device in the cluster.
runtime_num_threads(int): The thread pool number of cpu kernel and actor used in runtime,
which must bigger than 0. Default value if 0.6 times of the machine threads, if you run many processes at
which must be bigger than 0. Default value is 30; if you run many processes at
the same time, you should set the value smaller to avoid thread contention.
Raises:
ValueError: If input key is not an attribute in context.

View File

@ -1539,6 +1539,8 @@ class AscendTimelineGenerator(BaseTimelineGenerator):
class CpuTimelineGenerator(GpuTimelineGenerator):
"""Generate cpu Timeline data from file."""
_output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"
_display_filename = 'cpu_timeline_display_{}.json'
_timeline_summary_filename = 'cpu_timeline_summary_{}.json'
def _get_and_validate_path(self, file_name):
"""Generate op or activity file path from file name, and validate this path."""
@ -1575,3 +1577,53 @@ class CpuTimelineGenerator(GpuTimelineGenerator):
time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms
return timeline_list
def _load_timeline_data(self):
    """Load CPU op timeline rows, append scope-name and step rows, and return them sorted."""
    op_rows = self.load_cpu_op_data()
    op_rows.sort(key=lambda row: float(row[2]))
    self._max_scope_name_num = self._get_max_scope_name_num(op_rows)
    self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num

    # Start timestamps use a unit 1000x smaller than durations; this factor converts them.
    factor_start_time_uint_to_duration = 1e-3
    self._set_step_start_and_end_op_name(op_rows)
    step_rows = self._get_step_time_list(op_rows, factor_start_time_uint_to_duration)

    # Compute all per-scope aggregates from the op rows first, then append them,
    # so later scopes do not see rows added for earlier ones.
    scope_rows = [
        self._get_scope_name_time_list(op_rows, scope_prefix, factor_start_time_uint_to_duration)
        for scope_prefix in ("Default", "Gradients", "recompute_Default")
    ]
    for rows in scope_rows:
        op_rows.extend(rows)
    op_rows.extend(step_rows)

    # Python's sort is stable: sorting by (start, tid) first, then by raw start
    # time, keeps tid as the tie-breaker for rows with equal start times.
    op_rows.sort(key=lambda row: (float(row[self._start_time_idx]), row[self._tid_idx]))
    op_rows.sort(key=lambda row: float(row[2]))
    return op_rows
def init_timeline(self):
    """Init timeline metadata, adding all collected info."""
    rows = self._load_timeline_data()

    # Tracks which streams were seen while walking the timeline rows.
    stream_counts = {}
    for row in rows:
        self._parse_timeline_data(row, 0)
        if len(row) == 4:
            # Only 4-field rows carry stream information.
            self._update_num_of_streams(row, stream_counts)

    # Prepend the format/thread meta entries to the parsed timeline meta.
    self._format_meta_data_list.extend(self._timeline_meta)
    self._timeline_meta = self._format_meta_data_list

    # Fold the number of distinct streams into the summary.
    self._timeline_summary['num_of_streams'] += len(stream_counts)

View File

@ -34,7 +34,7 @@ from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
from mindspore.profiler.parser.framework_parser import FrameworkParser
from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
from mindspore.profiler.parser.integrator import Integrator
from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator
from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator, CpuTimelineGenerator
from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser
from mindspore.profiler.parser.minddata_parser import MinddataParser
from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer
@ -164,6 +164,11 @@ class Profiler:
self._cpu_profiler = cpu_profiler.get_instance()
self._cpu_profiler.init(self._output_path)
if self._device_target and self._device_target == "CPU":
self.start_profile = kwargs.pop("start_profile", True)
if not isinstance(self.start_profile, bool):
raise TypeError("The parameter start_profile must be bool.")
if self._device_target and self._device_target == "GPU":
gpu_profiler = c_expression.GPUProfiler
self._gpu_profiler = gpu_profiler.get_instance()
@ -296,6 +301,9 @@ class Profiler:
self._cpu_profiler.stop()
if self._device_target and self._device_target == "CPU":
self._cpu_analyse()
if self._device_target and self._device_target == "GPU":
self._gpu_analyse()
@ -590,6 +598,21 @@ class Profiler:
'otherwise, this warning can be ignored.'
)
def _cpu_analyse(self):
    """Collect and analyse cpu performance data.

    Returns:
        CpuTimelineGenerator, the generator that wrote the timeline files.

    Raises:
        RuntimeError: If the timeline data cannot be written.
    """
    size_limit = 100 * 1024 * 1024  # 100MB cap for the timeline display file.
    try:
        timeline_generator = CpuTimelineGenerator(self._output_path, 0)
        timeline_generator.init_timeline()
        timeline_generator.write_timeline(size_limit)
        timeline_generator.write_timeline_summary()
        return timeline_generator
    except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
        logger.warning('Fail to write timeline data: %s', err)
        # Chain the original exception so the root cause is not lost.
        raise RuntimeError('Fail to write timeline data.') from err
def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True,
is_gpu_kernel_async_launch_flag=False):
"""