forked from mindspore-Ecosystem/mindspore
!30003 add cpu timeline profiling and change threadpool default threads
Merge pull request !30003 from fangzehua/add_profi
commit d37c8719fd
@@ -138,7 +138,7 @@ MindSpore context, used to configure the current execution environment, including the execution mode, execution
 - **grad_for_scalar** (bool): Whether to compute gradients for scalar inputs. Default: False. When `grad_for_scalar` is set to True, scalar inputs of a function can be derived. Because the backend does not currently support scaling operations, this interface only supports simple operations that can be deduced by the frontend.
 - **enable_compile_cache** (bool) - Whether to load or save the graph compiled by the frontend. When `enable_compile_cache` is set to True, a hardware-independent compilation cache is generated and exported to a MINDIR file during the first execution. When the network is executed again, if `enable_compile_cache` is still True and the network script has not changed, the compilation cache is loaded. Note that only limited automatic detection of changes to Python scripts is currently supported, which means there is a correctness risk. Default: False. This is an experimental feature that may be changed or removed.
 - **compile_cache_path** (str) - The path for saving the compilation cache of the frontend graph. Default: ".". If the directory does not exist, the system creates it automatically. The cache is saved to the directory `compile_cache_path/rank_${rank_id}/`; `rank_id` is the ID of the current device in the cluster.
-- **runtime_num_threads** (int) - Controls the number of threads in the runtime thread pool. The default value is 0.6 times the number of system threads.
+- **runtime_num_threads** (int) - Controls the number of threads in the runtime thread pool. The default value is 30.

 **Raises:**

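Editorial note: as a usage sketch for the `enable_compile_cache` / `compile_cache_path` options documented in the hunk above (the directory name `./graph_cache` is an arbitrary example, not from the commit):

    import mindspore.context as context

    # First run: exports a hardware-independent MINDIR compile cache to
    # ./graph_cache/rank_${rank_id}/; later runs with an unchanged script load it.
    context.set_context(enable_compile_cache=True, compile_cache_path="./graph_cache")
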
@@ -30,7 +30,7 @@ void ComputeThreadNums(size_t *actor_thread_num, size_t *actor_and_kernel_thread
   const size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
   auto runtime_num_threads = static_cast<size_t>(context_ptr->get_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS));
   size_t runtime_num_threads_min = std::min(runtime_num_threads, cpu_core_num);
-  const float kActorUsage = 0.2;
+  const float kActorUsage = 0.18;
   const size_t kActorThreadMinNum = 2;
   size_t actor_thread_max_num =
     std::max(static_cast<size_t>(std::floor(runtime_num_threads_min * kActorUsage)), kActorThreadMinNum);

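Editorial note: a minimal Python sketch of the split computed above; the function name and the second return value are illustrative assumptions, since the hunk shows only the start of the C++ function:

    import math

    def compute_thread_nums(runtime_num_threads, cpu_core_num,
                            actor_usage=0.18, actor_thread_min_num=2):
        """Sketch of the actor/kernel thread split shown in the C++ hunk."""
        # The configured pool size is capped at the available cores
        # (hardware_concurrency() - 1 in the C++ code).
        runtime_num_threads_min = min(runtime_num_threads, cpu_core_num)
        # Roughly 18% of the pool goes to actor threads, with a floor of 2.
        actor_thread_max_num = max(math.floor(runtime_num_threads_min * actor_usage),
                                   actor_thread_min_num)
        return actor_thread_max_num, runtime_num_threads_min

    # With the new default pool of 30 threads on a 16-core machine:
    print(compute_thread_nums(30, 15))  # (2, 15)

Under the old 0.2 factor the same inputs would yield floor(15 * 0.2) = 3 actor threads, so the change slightly shifts capacity from actor threads toward kernel threads.
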
@@ -101,10 +101,8 @@ MsContext::MsContext(const std::string &policy, const std::string &target) {
   set_param<bool>(MS_CTX_ENABLE_PYNATIVE_OP_GRAPH_CACHE, true);
   set_param<bool>(MS_CTX_ENABLE_MEM_SCHEDULER, false);

-  size_t cpu_core_num = std::thread::hardware_concurrency() - 1;
-  constexpr float kCpuUsage = 0.6;
-  uint32_t runtime_num_threads = std::max(static_cast<int>(std::floor(cpu_core_num * kCpuUsage)), 1);
-  set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, runtime_num_threads);
+  uint32_t kDefaultRuntimeNumThreads = 30;
+  set_param<uint32_t>(MS_CTX_RUNTIME_NUM_THREADS, kDefaultRuntimeNumThreads);

   backend_policy_ = policy_map_[policy];
 }

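Editorial note: because the constructor now pins the default to 30 instead of deriving it from the machine, users on large or heavily shared machines may want to size the pool themselves. A hedged sketch using the public `set_context` API, reproducing the old 0.6-of-cores heuristic (`os.cpu_count()` standing in for `hardware_concurrency()` is an assumption):

    import os
    import mindspore.context as context

    # Re-create the old per-machine default: 0.6 * (cores - 1), at least 1.
    num_threads = max(int((os.cpu_count() - 1) * 0.6), 1)
    context.set_context(runtime_num_threads=num_threads)
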
@@ -830,7 +830,7 @@ def set_context(**kwargs):
            The cache will be saved to the directory of `compile_cache_path/rank_${rank_id}/`. The `rank_id` is
            the ID of the current device in the cluster.
        runtime_num_threads(int): The number of threads in the runtime thread pool used by cpu kernels and actors,
-           which must be bigger than 0. The default value is 0.6 times the number of machine threads; if you run many processes at
+           which must be bigger than 0. The default value is 30; if you run many processes at
            the same time, you should set the value smaller to avoid thread contention.

    Raises:
        ValueError: If input key is not an attribute in context.

@@ -1539,6 +1539,8 @@ class AscendTimelineGenerator(BaseTimelineGenerator):
 class CpuTimelineGenerator(GpuTimelineGenerator):
     """Generate cpu Timeline data from file."""
     _output_op_execute_time_file_path = "cpu_op_execute_timestamp_{}.txt"
+    _display_filename = 'cpu_timeline_display_{}.json'
+    _timeline_summary_filename = 'cpu_timeline_summary_{}.json'

     def _get_and_validate_path(self, file_name):
         """Generate op or activity file path from file name, and validate this path."""

@@ -1575,3 +1577,53 @@ class CpuTimelineGenerator(GpuTimelineGenerator):
             time_item[self._duration_idx] = float(time_item[self._duration_idx]) / factor_us_to_ms

         return timeline_list
+
+    def _load_timeline_data(self):
+        """Load timeline data from file."""
+        timeline_list = self.load_cpu_op_data()
+
+        timeline_list.sort(key=lambda x: float(x[2]))
+        self._max_scope_name_num = self._get_max_scope_name_num(timeline_list)
+        self._timeline_summary['max_scope_name_num'] = self._max_scope_name_num
+
+        # Generate step time.
+        factor_start_time_uint_to_duration = 1e-3
+        self._set_step_start_and_end_op_name(timeline_list)
+
+        step_time_list = self._get_step_time_list(timeline_list, factor_start_time_uint_to_duration)
+
+        # Add Scope Name.
+        default_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Default",
+                                                                      factor_start_time_uint_to_duration)
+        gradient_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "Gradients",
+                                                                       factor_start_time_uint_to_duration)
+        recompute_scope_name_time_list = self._get_scope_name_time_list(timeline_list, "recompute_Default",
+                                                                        factor_start_time_uint_to_duration)
+        timeline_list.extend(default_scope_name_time_list)
+        timeline_list.extend(gradient_scope_name_time_list)
+        timeline_list.extend(recompute_scope_name_time_list)
+        timeline_list.extend(step_time_list)
+
+        timeline_list.sort(key=lambda x: (float(x[self._start_time_idx]), x[self._tid_idx]))
+        timeline_list.sort(key=lambda x: float(x[2]))
+
+        return timeline_list
+
+    def init_timeline(self):
+        """Init timeline metadata, adding all collected info."""
+        timeline_list = self._load_timeline_data()
+
+        # Init a dict for counting the num of streams.
+        stream_count_dict = {}
+        for timeline in timeline_list:
+            self._parse_timeline_data(timeline, 0)
+            # Updating the collection of streams.
+            if len(timeline) == 4:
+                self._update_num_of_streams(timeline, stream_count_dict)
+
+        # Add format thread meta data.
+        self._format_meta_data_list.extend(self._timeline_meta)
+        self._timeline_meta = self._format_meta_data_list
+
+        # Update timeline summary info
+        self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())

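Editorial note on unit handling: op rows carry start timestamps and durations in raw units, durations are divided by `factor_us_to_ms` (first context line), and `factor_start_time_uint_to_duration = 1e-3` scales start times when step intervals are computed. A minimal sketch of the duration conversion; the four-field row layout is a hypothetical illustration, not taken from the commit:

    FACTOR_US_TO_MS = 1e3  # matches the factor_us_to_ms division in the context line

    def convert_duration(time_item, duration_idx=3):
        """Convert one timeline row's duration from microseconds to milliseconds."""
        # Hypothetical row: [op_name, tid, start_time, duration]
        time_item[duration_idx] = float(time_item[duration_idx]) / FACTOR_US_TO_MS
        return time_item

    print(convert_duration(["MatMul", "0", "1624000123.0", "2500.0"]))
    # ['MatMul', '0', '1624000123.0', 2.5]
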
@@ -34,7 +34,7 @@ from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
 from mindspore.profiler.parser.framework_parser import FrameworkParser
 from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
 from mindspore.profiler.parser.integrator import Integrator
-from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator
+from mindspore.profiler.parser.integrator import GpuTimelineGenerator, AscendTimelineGenerator, CpuTimelineGenerator
 from mindspore.profiler.parser.memory_usage_parser import MemoryUsageParser
 from mindspore.profiler.parser.minddata_parser import MinddataParser
 from mindspore.profiler.parser.minddata_analyzer import MinddataProfilingAnalyzer

@@ -164,6 +164,11 @@ class Profiler:
         self._cpu_profiler = cpu_profiler.get_instance()
         self._cpu_profiler.init(self._output_path)

+        if self._device_target and self._device_target == "CPU":
+            self.start_profile = kwargs.pop("start_profile", True)
+            if not isinstance(self.start_profile, bool):
+                raise TypeError("The parameter start_profile must be bool.")
+
         if self._device_target and self._device_target == "GPU":
             gpu_profiler = c_expression.GPUProfiler
             self._gpu_profiler = gpu_profiler.get_instance()

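Editorial note: with this branch in place, CPU profiling is driven through the ordinary `Profiler` front end; a minimal usage sketch (the output path is an arbitrary example):

    import mindspore.context as context
    from mindspore.profiler import Profiler

    context.set_context(mode=context.GRAPH_MODE, device_target="CPU")
    profiler = Profiler(output_path="./profiler_data")  # start_profile defaults to True

    # ... build and run the network ...

    profiler.analyse()  # on a CPU target this reaches _cpu_analyse() via the branch in the next hunk
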
@@ -296,6 +301,9 @@ class Profiler:

         self._cpu_profiler.stop()

+        if self._device_target and self._device_target == "CPU":
+            self._cpu_analyse()
+
         if self._device_target and self._device_target == "GPU":
             self._gpu_analyse()

@@ -590,6 +598,21 @@ class Profiler:
                 'otherwise, this warning can be ignored.'
             )

+    def _cpu_analyse(self):
+        """Collect and analyse cpu performance data"""
+
+        try:
+            size_limit = 100 * 1024 * 1024  # 100MB
+            timeline_generator = CpuTimelineGenerator(self._output_path, 0)
+            timeline_generator.init_timeline()
+            timeline_generator.write_timeline(size_limit)
+            timeline_generator.write_timeline_summary()
+            return timeline_generator
+        except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
+            logger.warning('Fail to write timeline data: %s', err)
+            raise RuntimeError('Fail to write timeline data.')
+
+
     def _analyse_step_trace(self, source_path=None, framework_parser=None, is_training_mode_flag=True,
                             is_gpu_kernel_async_launch_flag=False):
         """

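Editorial note: the two write calls emit the files named by the class attributes added earlier, `cpu_timeline_display_{rank}.json` (Chrome-trace format) and `cpu_timeline_summary_{rank}.json`. A hedged inspection sketch, assuming rank 0 and that both files land directly under the profiler output path:

    import json
    import os

    output_path = "./profiler_data"  # same path passed to Profiler above
    with open(os.path.join(output_path, "cpu_timeline_summary_0.json")) as f:
        print(json.load(f))  # e.g. num_of_streams, max_scope_name_num, ...
    # cpu_timeline_display_0.json can be loaded in chrome://tracing for a visual view.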