From 4bb4417aff97bffc9893ff22529060c65cee64a6 Mon Sep 17 00:00:00 2001
From: fangzehua
Date: Fri, 11 Feb 2022 16:24:41 +0800
Subject: [PATCH] support cpu profiling timeline

---
 docs/api/api_python/mindspore.context.rst          |  2 +-
 .../graph_scheduler/actor/actor_common.cc          |  2 +-
 mindspore/core/utils/ms_context.cc                 |  6 +--
 mindspore/python/mindspore/context.py              |  2 +-
 .../mindspore/profiler/parser/integrator.py        | 52 +++++++++++++++++++
 .../python/mindspore/profiler/profiling.py         | 25 ++++++++-
 6 files changed, 81 insertions(+), 8 deletions(-)

diff --git a/docs/api/api_python/mindspore.context.rst b/docs/api/api_python/mindspore.context.rst
index b7d12308a1e..d18875adf1e 100644
--- a/docs/api/api_python/mindspore.context.rst
+++ b/docs/api/api_python/mindspore.context.rst
@@ -138,7 +138,7 @@ MindSpore context, used to configure the current execution environment, includi
     - **grad_for_scalar** (bool): Whether to get the gradient for scalars. Default: False. When `grad_for_scalar` is set to True, the scalar inputs of a function can be derived. Because the back end does not support scaling operations currently, this interface only supports simple operations that can be deduced by the front end.
     - **enable_compile_cache** (bool) - Whether to load or save the graph compiled by the front end. When `enable_compile_cache` is set to True, a hardware-independent compilation cache is generated during the first execution and exported to a MINDIR file. When the network is executed again, if `enable_compile_cache` is still True and the network script has not been changed, the compilation cache is loaded. Note that only limited automatic detection of Python script changes is supported at present, which means there is a correctness risk. Default: False. This is an experimental feature that may be changed or removed.
     - **compile_cache_path** (str) - Path for saving the compilation cache of the front-end graph. Default: ".". If the directory does not exist, the system creates it automatically. The cache is saved to the directory `compile_cache_path/rank_${rank_id}/`. `rank_id` is the ID of the current device in the cluster.
-    - **runtime_num_threads** (int) - Controls the number of threads in the runtime thread pool. The default value is 0.6 times the number of system threads.
+    - **runtime_num_threads** (int) - Controls the number of threads in the runtime thread pool. The default value is 30.
 
 **Raises:**
 
diff --git a/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc b/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc
index 7f2798ee0ca..991c60a01b9 100644
--- a/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc
+++ b/mindspore/ccsrc/runtime/graph_scheduler/actor/actor_common.cc
@@ -30,7 +30,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread_num) {
   const size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
   auto runtime_num_threads = static_cast<size_t>(context_ptr->get_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS));
   size_t runtime_num_threads_min = std::min(runtime_num_threads, cpu_core_num);
-  const float kActorUsage = 0.2;
+  const float kActorUsage = 0.18;
   const size_t kActorThreadMinNum = 2;
   size_t actor_thread_max_num =
     std::max(static_cast<size_t>(std::floor(runtime_num_threads_min * kActorUsage)), kActorThreadMinNum);
diff --git a/mindspore/core/utils/ms_context.cc b/mindspore/core/utils/ms_context.cc
index 07c00295146..f300e82c709 100644
--- a/mindspore/core/utils/ms_context.cc
+++ b/mindspore/core/utils/ms_context.cc
@@ -101,10 +101,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true);
   set_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER, false);
 
-  size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
-  constexpr float kCpuUsage = 0.6;
-  uint32_t runtime_num_threads = std::max(static_cast<uint32_t>(std::floor(cpu_core_num * kCpuUsage)), 1);
-  set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, runtime_num_threads);
+  uint32_t kDefaultRuntimeNumThreads = 30;
+  set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, kDefaultRuntimeNumThreads);
 
   backend_policy_ = policy_map_[policy];
 }
diff --git a/mindspore/python/mindspore/context.py b/mindspore/python/mindspore/context.py
index 99fcdbcc647..06b6754b11a 100644
--- a/mindspore/python/mindspore/context.py
+++ b/mindspore/python/mindspore/context.py
@@ -830,7 +830,7 @@ def set_context(**kwargs):
             The cache will be saved to the directory of `compile_cache_path/rank_${rank_id}/`.
             The `rank_id` is the ID of the current device in the cluster.
         runtime_num_threads(int): The thread pool number of cpu kernel and actor used in runtime,
-            which must bigger than 0. Default value if 0.6 times of the machine threads, if you run many processes at
+            which must be bigger than 0. Default value is 30. If you run many processes at
             the same time, you should set the value smaller to avoid thread contention.
 
     Raises:
         ValueError: If input key is not an attribute in context.
diff --git a/mindspore/python/mindspore/profiler/parser/integrator.py b/mindspore/python/mindspore/profiler/parser/integrator.py
index b391557d7bd..a4edc378624 100644
--- a/mindspore/python/mindspore/profiler/parser/integrator.py
+++ b/mindspore/python/mindspore/profiler/parser/integrator.py
@@ -1539,6 +1539,8 @@ class AscendTimelineGenerator(BaseTimelineGenerator):
 class CpuTimelineGenerator(GpuTimelineGenerator):
     """Generate cpu Timeline data from file."""
     _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"
+    _display_filename = 'cpu_timeline_display_{}.json'
+    _timeline_summary_filename = 'cpu_timeline_summary_{}.json'
 
     def _get_and_validate_path(self, file_name):
         """Generate op or activity file path from file name, and validate this path."""
@@ -1575,3 +1577,53 @@ class CpuTimelineGenerator(GpuTimelineGenerator):
             time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms
 
         return timeline_list
+
+    def _load_timeline_data(self):
+        """Load timeline data from file."""
+        timeline_list = self.load_cpu_op_data()
+
+        timeline_list.sort(key=lambda x: float(x[2]))
+        self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
+        self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
+
+        # Generate step time.
+        factor_start_time_uint_to_duration = 1e-3
+        self._set_step_start_and_end_op_name(timeline_list)
+
+        step_time_list = self._get_step_time_list(timeline_list, factor_start_time_uint_to_duration)
+
+        # Add scope name time lists.
+        default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default",
+                                                                      factor_start_time_uint_to_duration)
+        gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients",
+                                                                       factor_start_time_uint_to_duration)
+        recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default",
+                                                                        factor_start_time_uint_to_duration)
+        timeline_list.extend(default_scope_name_time_list)
+        timeline_list.extend(gradient_scope_name_time_list)
+        timeline_list.extend(recompute_scope_name_time_list)
+        timeline_list.extend(step_time_list)
+
+        timeline_list.sort(key=lambda x: (float(x[self._start_time_idx]), x[self._tid_idx]))
+        timeline_list.sort(key=lambda x: float(x[2]))
+
+        return timeline_list
+
+    def init_timeline(self):
+        """Init timeline metadata, adding all collected info."""
+        timeline_list = self._load_timeline_data()
+
+        # Init a dict for counting the number of streams.
+        stream_count_dict = {}
+        for timeline in timeline_list:
+            self._parse_timeline_data(timeline, 0)
+            # Update the collection of streams.
+            if len(timeline) == 4:
+                self._update_num_of_streams(timeline, stream_count_dict)
+
+        # Add formatted thread metadata.
+        self._format_meta_data_list.extend(self._timeline_meta)
+        self._timeline_meta = self._format_meta_data_list
+
+        # Update timeline summary info.
+        self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
diff --git a/mindspore/python/mindspore/profiler/profiling.py b/mindspore/python/mindspore/profiler/profiling.py
index 99c5a13286b..4ab1d66a014 100644
--- a/mindspore/python/mindspore/profiler/profiling.py
+++ b/mindspore/python/mindspore/profiler/profiling.py
@@ -34,7 +34,7 @@ from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
 from mindspore.profiler.parser.framework_parser import FrameworkParser
 from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
 from mindspore.profiler.parser.integrator import Integrator
-from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator
+from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator, CpuTimelineGenerator
 from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser
 from mindspore.profiler.parser.minddata_parser import MinddataParser
 from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer
@@ -164,6 +164,11 @@ class Profiler:
         self._cpu_profiler = cpu_profiler.get_instance()
         self._cpu_profiler.init(self._output_path)
 
+        if self._device_target and self._device_target == "CPU":
+            self.start_profile = kwargs.pop("start_profile", True)
+            if not isinstance(self.start_profile, bool):
+                raise TypeError("The parameter start_profile must be bool.")
+
         if self._device_target and self._device_target == "GPU":
             gpu_profiler = c_expression.GPUProfiler
             self._gpu_profiler = gpu_profiler.get_instance()
@@ -296,6 +301,9 @@ class Profiler:
 
         self._cpu_profiler.stop()
 
+        if self._device_target and self._device_target == "CPU":
+            self._cpu_analyse()
+
         if self._device_target and self._device_target == "GPU":
             self._gpu_analyse()
 
@@ -590,6 +598,21 @@ class Profiler:
                 'otherwise, this warning can be ignored.'
             )
 
+    def _cpu_analyse(self):
+        """Collect and analyse cpu performance data."""
+        try:
+            size_limit = 100 * 1024 * 1024  # 100MB
+            timeline_generator = CpuTimelineGenerator(self._output_path, 0)
+            timeline_generator.init_timeline()
+            timeline_generator.write_timeline(size_limit)
+            timeline_generator.write_timeline_summary()
+            return timeline_generator
+        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
+            logger.warning('Failed to write timeline data: %s', err)
+            raise RuntimeError('Failed to write timeline data.') from err
+
     def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True,
                             is_gpu_kernel_async_launch_flag=False):
         """
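
Usage note (not part of the patch): a minimal sketch of how the new CPU timeline
path would be exercised end to end. The network and tensor shapes are hypothetical
placeholders; Profiler, set_context, and analyse() are the APIs this patch touches,
and the output filenames come from the CpuTimelineGenerator class attributes above.

    import numpy as np
    import mindspore.context as context
    import mindspore.nn as nn
    import mindspore.ops as ops
    from mindspore import Tensor
    from mindspore.profiler import Profiler

    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")

    class SimpleNet(nn.Cell):
        """Placeholder network; any CPU-executable Cell works."""
        def __init__(self):
            super(SimpleNet, self).__init__()
            self.add = ops.Add()

        def construct(self, x, y):
            return self.add(x, y)

    # start_profile is now accepted and validated for CPU targets
    # (see the Profiler.__init__ hunk above).
    profiler = Profiler(output_path="./profiler_data", start_profile=True)
    net = SimpleNet()
    net(Tensor(np.ones((2, 2), np.float32)), Tensor(np.ones((2, 2), np.float32)))
    # analyse() dispatches to the new _cpu_analyse(), which writes
    # cpu_timeline_display_0.json and cpu_timeline_summary_0.json under
    # output_path (rank 0 is hard-coded in _cpu_analyse above).
    profiler.analyse()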
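
Tuning note (also not part of the patch): the runtime default changes from
0.6 x hardware threads to a flat 30. With the new kActorUsage of 0.18, a
30-thread pool on a host with at least 31 hardware threads yields
max(floor(30 * 0.18), 2) = 5 actor threads, with the remainder serving kernels.
When several processes share one host, the updated set_context docs recommend
shrinking the pool explicitly; the worker and core counts below are hypothetical.

    import mindspore.context as context

    # Hypothetical deployment: 8 worker processes on one 64-core host.
    # Giving each worker 8 runtime threads avoids the 8 x 30 thread
    # oversubscription the new fixed default would otherwise create.
    context.set_context(device_target="CPU", runtime_num_threads=8)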