!30003 add cpu timeline profiling and change threadpool default threads

Merge pull request !30003 from fangzehua/add_profi
This commit is contained in:
i-robot 2022-02-15 06:21:15 +00:00 committed by Gitee
commit d37c8719fd
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
6 changed files with 81 additions and 8 deletions

View File

@ -138,7 +138,7 @@ MindSpore context用于配置当前执行环境包括执行模式、执行
- **grad_for_scalar** (bool) 表示是否获取标量梯度。默认值False。当 `grad_for_scalar` 设置为True时则可以导出函数的标量输入。由于后端目前不支持伸缩操作所以该接口只支持在前端可推演的简单操作。
- **enable_compile_cache** (bool) - 表示是否加载或者保存前端编译的图。当 `enable_compile_cache` 被设置为True时在第一次执行的过程中一个硬件无关的编译缓存会被生成并且导出为一个MINDIR文件。当该网络被再次执行时如果 `enable_compile_cache` 仍然为True并且网络脚本没有被更改那么这个编译缓存会被加载。注意目前只支持有限的Python脚本更改的自动检测这意味着可能有正确性风险。默认值False。这是一个实验特性可能会被更改或者删除。
- **compile_cache_path** (str) - 保存前端图编译缓存的路径。默认值:"."。如果目录不存在,系统会自动创建这个目录。缓存会被保存到如下目录: `compile_cache_path/rank_${rank_id}/``rank_id` 是集群上当前设备的ID。
- **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为系统线程数的0.6倍
- **runtime_num_threads** (int) - 运行时线程池的线程数控制。 默认值为30
**异常:**

View File

@ -30,7 +30,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread
const size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
auto runtime_num_threads = static_cast<size_t>(context_ptr->get_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS));
size_t runtime_num_threads_min = std::min(runtime_num_threads, cpu_core_num);
const float kActorUsage = 0.2;
const float kActorUsage = 0.18;
const size_t kActorThreadMinNum = 2;
size_t actor_thread_max_num =
std::max(static_cast<size_t>(std::floor(runtime_num_threads_min * kActorUsage)), kActorThreadMinNum);

View File

@ -101,10 +101,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true);
set_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER, false);
size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
constexpr float kCpuUsage = 0.6;
uint32_t runtime_num_threads = std::max(static_cast<int>(std::floor(cpu_core_num * kCpuUsage)), 1);
set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, runtime_num_threads);
uint32_t kDefaultRuntimeNumThreads = 30;
set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, kDefaultRuntimeNumThreads);
backend_policy_ = policy_map_[policy];
}

View File

@ -830,7 +830,7 @@ def set_context(**kwargs):
The cache will be saved to the directory of `compile_cache_path/rank_${rank_id}/`. The `rank_id` is
the ID of the current device in the cluster.
runtime_num_threads(int): The thread pool number of cpu kernel and actor used in runtime,
which must bigger than 0. Default value if 0.6 times of the machine threads, if you run many processes at
which must be bigger than 0. Default value is 30; if you run many processes at
the same time, you should set the value smaller to avoid thread contention.
Raises:
ValueError: If input key is not an attribute in context.

View File

@ -1539,6 +1539,8 @@ class AscendTimelineGenerator(BaseTimelineGenerator):
class CpuTimelineGenerator(GpuTimelineGenerator):
"""Generate cpu Timeline data from file."""
_output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"
_display_filename = 'cpu_timeline_display_{}.json'
_timeline_summary_filename = 'cpu_timeline_summary_{}.json'
def _get_and_validate_path(self, file_name):
"""Generate op or activity file path from file name, and validate this path."""
@ -1575,3 +1577,53 @@ class CpuTimelineGenerator(GpuTimelineGenerator):
time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms
return timeline_list
def _load_timeline_data(self):
    """Load CPU op timeline rows, append scope-name and step rows, and return them sorted."""
    op_rows = self.load_cpu_op_data()
    op_rows.sort(key=lambda row: float(row[2]))
    self._max_scope_name_num = self._get_max_scope_name_num(op_rows)
    self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num

    # Start timestamps use a unit 1000x smaller than durations; this factor converts them.
    factor_start_time_uint_to_duration = 1e-3
    self._set_step_start_and_end_op_name(op_rows)
    step_rows = self._get_step_time_list(op_rows, factor_start_time_uint_to_duration)

    # Compute all per-scope aggregates from the op rows first, then append them,
    # so later scopes do not see rows added for earlier ones.
    scope_rows = [
        self._get_scope_name_time_list(op_rows, scope_prefix, factor_start_time_uint_to_duration)
        for scope_prefix in ("Default", "Gradients", "recompute_Default")
    ]
    for rows in scope_rows:
        op_rows.extend(rows)
    op_rows.extend(step_rows)

    # Python's sort is stable: sorting by (start, tid) first, then by raw start
    # time, keeps tid as the tie-breaker for rows with equal start times.
    op_rows.sort(key=lambda row: (float(row[self._start_time_idx]), row[self._tid_idx]))
    op_rows.sort(key=lambda row: float(row[2]))
    return op_rows
def init_timeline(self):
    """Init timeline metadata, adding all collected info."""
    rows = self._load_timeline_data()

    # Tracks which streams were seen while walking the timeline rows.
    stream_counts = {}
    for row in rows:
        self._parse_timeline_data(row, 0)
        if len(row) == 4:
            # Only 4-field rows carry stream information.
            self._update_num_of_streams(row, stream_counts)

    # Prepend the format/thread meta entries to the parsed timeline meta.
    self._format_meta_data_list.extend(self._timeline_meta)
    self._timeline_meta = self._format_meta_data_list

    # Fold the number of distinct streams into the summary.
    self._timeline_summary['num_of_streams'] += len(stream_counts)

View File

@ -34,7 +34,7 @@ from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
from mindspore.profiler.parser.framework_parser import FrameworkParser
from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
from mindspore.profiler.parser.integrator import Integrator
from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator
from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator, CpuTimelineGenerator
from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser
from mindspore.profiler.parser.minddata_parser import MinddataParser
from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer
@ -164,6 +164,11 @@ class Profiler:
self._cpu_profiler = cpu_profiler.get_instance()
self._cpu_profiler.init(self._output_path)
if self._device_target and self._device_target == "CPU":
self.start_profile = kwargs.pop("start_profile", True)
if not isinstance(self.start_profile, bool):
raise TypeError("The parameter start_profile must be bool.")
if self._device_target and self._device_target == "GPU":
gpu_profiler = c_expression.GPUProfiler
self._gpu_profiler = gpu_profiler.get_instance()
@ -296,6 +301,9 @@ class Profiler:
self._cpu_profiler.stop()
if self._device_target and self._device_target == "CPU":
self._cpu_analyse()
if self._device_target and self._device_target == "GPU":
self._gpu_analyse()
@ -590,6 +598,21 @@ class Profiler:
'otherwise, this warning can be ignored.'
)
def _cpu_analyse(self):
    """Collect and analyse cpu performance data.

    Returns:
        CpuTimelineGenerator, the generator that wrote the timeline files.

    Raises:
        RuntimeError: If the timeline data cannot be written.
    """
    size_limit = 100 * 1024 * 1024  # 100MB cap for the timeline display file.
    try:
        timeline_generator = CpuTimelineGenerator(self._output_path, 0)
        timeline_generator.init_timeline()
        timeline_generator.write_timeline(size_limit)
        timeline_generator.write_timeline_summary()
        return timeline_generator
    except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
        logger.warning('Fail to write timeline data: %s', err)
        # Chain the original exception so the root cause is not lost.
        raise RuntimeError('Fail to write timeline data.') from err
def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True,
is_gpu_kernel_async_launch_flag=False):
"""