From 4bb4417aff97bffc9893ff22529060c65cee64a6 Mon Sep 17 00:00:00 2001
From: fangzehua
Date: Fri, 11 Feb 2022 16:24:41 +0800
Subject: [PATCH] support cpu profiling timeline

---
 docs/api/api_python/mindspore.context.rst          |  2 +-
 .../graph_scheduler/actor/actor_common.cc          |  2 +-
 mindspore/core/utils/ms_context.cc                 |  6 +--
 mindspore/python/mindspore/context.py              |  2 +-
 .../mindspore/profiler/parser/integrator.py        | 52 +++++++++++++++++++
 .../python/mindspore/profiler/profiling.py         | 25 ++++++++-
 6 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/docs/api/api_python/mindspore.context.rst b/docs/api/api_python/mindspore.context.rst
index b7d12308a1e..d18875adf1e 100644
--- a/docs/api/api_python/mindspore.context.rst
+++ b/docs/api/api_python/mindspore.context.rst
@@ -138,7 +138,7 @@ MindSpore context, used to configure the current execution environment, includi
     - **grad_for_scalar** (bool): Whether to get the gradient for scalars. Default: False. When `grad_for_scalar` is set to True, the scalar inputs of a function can be derived. Because the back end does not support scaling operations currently, this interface only supports simple operations that can be deduced by the front end.
     - **enable_compile_cache** (bool) - Whether to load or save the graph compiled by the front end. When `enable_compile_cache` is set to True, a hardware-independent compilation cache is generated during the first execution and exported to a MINDIR file. When the network is executed again, if `enable_compile_cache` is still True and the network script has not been changed, the compilation cache is loaded. Note that only limited automatic detection of Python script changes is supported at present, which means there is a correctness risk. Default: False. This is an experimental feature that may be changed or removed.
     - **compile_cache_path** (str) - Path for saving the compilation cache of the front-end graph. Default: ".". If the directory does not exist, the system creates it automatically. The cache is saved to the directory `compile_cache_path/rank_${rank_id}/`. `rank_id` is the ID of the current device in the cluster.
-    - **runtime_num_threads** (int) - Controls the number of threads in the runtime thread pool. The default value is 0.6 times the number of system threads.
+    - **runtime_num_threads** (int) - Controls the number of threads in the runtime thread pool. The default value is 30.
 
 **Raises:**
 
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc
index 7f2798ee0ca..991c60a01b9 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc
@@ -30,7 +30,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num) {
   const size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
   auto runtime_num_threads = static_cast<size_t>(context_ptr->get_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS));
   size_t runtime_num_threads_min = std::min(runtime_num_threads, cpu_core_num);
-  const float kActorUsage = 0.2;
+  const float kActorUsage = 0.18;
   const size_t kActorThreadMinNum = 2;
   size_t actor_thread_max_num =
     std::max(static_cast<size_t>(std::floor(runtime_num_threads_min * kActorUsage)), kActorThreadMinNum);
diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc
index 07c00295146..f300e82c709 100644
--- a/mindspore/core/utils/ms_context.cc
+++ b/mindspore/core/utils/ms_context.cc
@@ -101,10 +101,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true);
   set_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER, false);
 
-  size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
-  constexpr float kCpuUsage = 0.6;
-  uint32_t runtime_num_threads = std::max(static_cast<uint32_t>(std::floor(cpu_core_num * kCpuUsage)), 1);
-  set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, runtime_num_threads);
+  uint32_t kDefaultRuntimeNumThreads = 30;
+  set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, kDefaultRuntimeNumThreads);
 
   backend_policy_ = policy_map_[policy];
 }
diff --git a/mindspore/python/mindspore/context.py b/mindspore/python/mindspore/context.py
index 99fcdbcc647..06b6754b11a 100644
--- a/mindspore/python/mindspore/context.py
+++ b/mindspore/python/mindspore/context.py
@@ -830,7 +830,7 @@ def set_context(**kwargs):
             The cache will be saved to the directory of `compile_cache_path/rank_${rank_id}/`.
             The `rank_id` is the ID of the current device in the cluster.
         runtime_num_threads(int): The thread pool number of cpu kernel and actor used in runtime,
-            which must bigger than 0. Default value if 0.6 times of the machine threads, if you run many processes at
+            which must be bigger than 0. Default value is 30. If you run many processes at
             the same time, you should set the value smaller to avoid thread contention.
 
     Raises:
         ValueError: If input key is not an attribute in context.
diff --git a/mindspore/python/mindspore/profiler/parser/integrator.py b/mindspore/python/mindspore/profiler/parser/integrator.py
index b391557d7bd..a4edc378624 100644
--- a/mindspore/python/mindspore/profiler/parser/integrator.py
+++ b/mindspore/python/mindspore/profiler/parser/integrator.py
@@ -1539,6 +1539,8 @@ class AscendTimelineGenerator(BaseTimelineGenerator):
 class CpuTimelineGenerator(GpuTimelineGenerator):
     """Generate cpu Timeline data from file."""
     _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"
+    _display_filename = 'cpu_timeline_display_{}.json'
+    _timeline_summary_filename = 'cpu_timeline_summary_{}.json'
 
     def _get_and_validate_path(self, file_name):
         """Generate op or activity file path from file name, and validate this path."""
@@ -1575,3 +1577,53 @@ class CpuTimelineGenerator(GpuTimelineGenerator):
             time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms
 
         return timeline_list
+
+    def _load_timeline_data(self):
+        """Load timeline data from file."""
+        timeline_list = self.load_cpu_op_data()
+
+        timeline_list.sort(key=lambda x: float(x[2]))
+        self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
+        self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
+
+        # Generate step time.
+        factor_start_time_uint_to_duration = 1e-3
+        self._set_step_start_and_end_op_name(timeline_list)
+
+        step_time_list = self._get_step_time_list(timeline_list, factor_start_time_uint_to_duration)
+
+        # Add scope name time lists.
+        default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default",
+                                                                      factor_start_time_uint_to_duration)
+        gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients",
+                                                                       factor_start_time_uint_to_duration)
+        recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default",
+                                                                        factor_start_time_uint_to_duration)
+        timeline_list.extend(default_scope_name_time_list)
+        timeline_list.extend(gradient_scope_name_time_list)
+        timeline_list.extend(recompute_scope_name_time_list)
+        timeline_list.extend(step_time_list)
+
+        timeline_list.sort(key=lambda x: (float(x[self._start_time_idx]), x[self._tid_idx]))
+        timeline_list.sort(key=lambda x: float(x[2]))
+
+        return timeline_list
+
+    def init_timeline(self):
+        """Init timeline metadata, adding all collected info."""
+        timeline_list = self._load_timeline_data()
+
+        # Init a dict for counting the number of streams.
+        stream_count_dict = {}
+        for timeline in timeline_list:
+            self._parse_timeline_data(timeline, 0)
+            # Update the collection of streams.
+            if len(timeline) == 4:
+                self._update_num_of_streams(timeline, stream_count_dict)
+
+        # Add formatted thread metadata.
+        self._format_meta_data_list.extend(self._timeline_meta)
+        self._timeline_meta = self._format_meta_data_list
+
+        # Update timeline summary info.
+        self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
diff --git a/mindspore/python/mindspore/profiler/profiling.py b/mindspore/python/mindspore/profiler/profiling.py
index 99c5a13286b..4ab1d66a014 100644
--- a/mindspore/python/mindspore/profiler/profiling.py
+++ b/mindspore/python/mindspore/profiler/profiling.py
@@ -34,7 +34,7 @@ from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
 from mindspore.profiler.parser.framework_parser import FrameworkParser
 from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
 from mindspore.profiler.parser.integrator import Integrator
-from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator
+from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator, CpuTimelineGenerator
 from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser
 from mindspore.profiler.parser.minddata_parser import MinddataParser
 from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer
@@ -164,6 +164,11 @@ class Profiler:
         self._cpu_profiler = cpu_profiler.get_instance()
         self._cpu_profiler.init(self._output_path)
 
+        if self._device_target and self._device_target == "CPU":
+            self.start_profile = kwargs.pop("start_profile", True)
+            if not isinstance(self.start_profile, bool):
+                raise TypeError("The parameter start_profile must be bool.")
+
         if self._device_target and self._device_target == "GPU":
             gpu_profiler = c_expression.GPUProfiler
             self._gpu_profiler = gpu_profiler.get_instance()
@@ -296,6 +301,9 @@ class Profiler:
 
         self._cpu_profiler.stop()
 
+        if self._device_target and self._device_target == "CPU":
+            self._cpu_analyse()
+
         if self._device_target and self._device_target == "GPU":
             self._gpu_analyse()
 
@@ -590,6 +598,21 @@ class Profiler:
                 'otherwise, this warning can be ignored.'
             )
 
+    def _cpu_analyse(self):
+        """Collect and analyse cpu performance data."""
+        try:
+            size_limit = 100 * 1024 * 1024  # 100MB
+            timeline_generator = CpuTimelineGenerator(self._output_path, 0)
+            timeline_generator.init_timeline()
+            timeline_generator.write_timeline(size_limit)
+            timeline_generator.write_timeline_summary()
+            return timeline_generator
+        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
+            logger.warning('Failed to write timeline data: %s', err)
+            raise RuntimeError('Failed to write timeline data.') from err
+
     def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True,
                             is_gpu_kernel_async_launch_flag=False):
         """
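
Usage note (not part of the patch): a minimal sketch of how the new CPU timeline
path would be exercised end to end. The network and tensor shapes are hypothetical
placeholders; Profiler, set_context, and analyse() are the APIs this patch touches,
and the output filenames come from the CpuTimelineGenerator class attributes above.

    import numpy as np
    import mindspore.context as context
    import mindspore.nn as nn
    import mindspore.ops as ops
    from mindspore import Tensor
    from mindspore.profiler import Profiler

    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")

    class SimpleNet(nn.Cell):
        """Placeholder network; any CPU-executable Cell works."""
        def __init__(self):
            super(SimpleNet, self).__init__()
            self.add = ops.Add()

        def construct(self, x, y):
            return self.add(x, y)

    # start_profile is now accepted and validated for CPU targets
    # (see the Profiler.__init__ hunk above).
    profiler = Profiler(output_path="./profiler_data", start_profile=True)
    net = SimpleNet()
    net(Tensor(np.ones((2, 2), np.float32)), Tensor(np.ones((2, 2), np.float32)))
    # analyse() dispatches to the new _cpu_analyse(), which writes
    # cpu_timeline_display_0.json and cpu_timeline_summary_0.json under
    # output_path (rank 0 is hard-coded in _cpu_analyse above).
    profiler.analyse()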
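
Tuning note (also not part of the patch): the runtime default changes from
0.6 x hardware threads to a flat 30. With the new kActorUsage of 0.18, a
30-thread pool on a host with at least 31 hardware threads yields
max(floor(30 * 0.18), 2) = 5 actor threads, with the remainder serving kernels.
When several processes share one host, the updated set_context docs recommend
shrinking the pool explicitly; the worker and core counts below are hypothetical.

    import mindspore.context as context

    # Hypothetical deployment: 8 worker processes on one 64-core host.
    # Giving each worker 8 runtime threads avoids the 8 x 30 thread
    # oversubscription the new fixed default would otherwise create.
    context.set_context(device_target="CPU", runtime_num_threads=8)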