forked from mindspore-Ecosystem/mindspore
!47494 环境变量使能profiler与profiler接口方式参数统一
Merge pull request !47494 from liuchuting/env_p
This commit is contained in:
commit
5708472537
|
@ -18,6 +18,7 @@ import os
|
|||
import time
|
||||
from enum import Enum
|
||||
from mindspore.profiler import Profiler
|
||||
from mindspore.profiler.profiling import AICORE_METRICS_DICT
|
||||
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
|
||||
from mindspore.profiler.parser.integrator import DeviceTarget
|
||||
|
||||
|
@ -27,9 +28,9 @@ from mindspore import log as logger, context
|
|||
class DeviceSupportParam(Enum):
    """The device target enum.

    Each member's value is the list of option names supported on that device
    target. Only one definition per member is kept: an ``Enum`` body may not
    bind the same member name twice.
    """
    CPU = ['start', 'output_path']
    GPU = ['start', 'output_path', 'data_process', 'timeline_limit', 'sync_enable']
    ASCEND = ['start', 'output_path', 'data_process', 'timeline_limit', 'profile_memory', 'parallel_strategy',
              'profile_communication', 'aicore_metrics', 'l2_cache']
|
||||
|
||||
|
||||
def get_profiling_options():
|
||||
|
@ -68,7 +69,7 @@ def construct_profiling_options():
|
|||
conbine_options["output_path"] = validate_and_normalize_path(output_path)
|
||||
if not os.path.exists(output_path):
|
||||
os.makedirs(output_path, exist_ok=True)
|
||||
conbine_options["profiler_path"] = os.path.join(output_path, "profiler")
|
||||
conbine_options["file_output_path"] = os.path.join(output_path, "profiler")
|
||||
return conbine_options
|
||||
|
||||
|
||||
|
@ -79,16 +80,18 @@ def parse_pubilc_args(options):
|
|||
"The 'start' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
|
||||
f" but got type {type(options.get('start'))}, it will be set to false.")
|
||||
options["start"] = False
|
||||
if not isinstance(options.get("data_process_enable"), bool):
|
||||
if not options.get("start"):
|
||||
return options
|
||||
if not isinstance(options.get("data_process"), bool):
|
||||
logger.warning(
|
||||
"The 'data_process' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
|
||||
f" but got type {type(options.get('data_process_enable'))}, it will be set to true.")
|
||||
options["data_process_enable"] = True
|
||||
if not isinstance(options.get("timeline_limit_size"), int):
|
||||
f" but got type {type(options.get('data_process'))}, it will be set to true.")
|
||||
options["data_process"] = True
|
||||
if not isinstance(options.get("timeline_limit"), int):
|
||||
logger.warning(
|
||||
"The 'timeline_limit_size' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
|
||||
f" but got type {type(options.get('timeline_limit_size'))}, it will be set to 500.")
|
||||
options["timeline_limit_size"] = 500
|
||||
"The 'timeline_limit' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
|
||||
f" but got type {type(options.get('timeline_limit'))}, it will be set to 500.")
|
||||
options["timeline_limit"] = 500
|
||||
absolute_path = os.path.join(os.getcwd(), "data")
|
||||
if not isinstance(options.get("output_path"), str):
|
||||
logger.warning(
|
||||
|
@ -117,23 +120,23 @@ def parse_ascend_args(options):
|
|||
"""Parsing ascend profiling args."""
|
||||
if not isinstance(options.get("profile_memory"), bool):
|
||||
logger.warning(
|
||||
"The 'memory' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
|
||||
"The 'profile_memory' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
|
||||
f" but got type {type(options.get('profile_memory'))}, it will be set to false.")
|
||||
options["profile_memory"] = False
|
||||
if not isinstance(options.get("parallel_strategy_enable"), bool):
|
||||
if not isinstance(options.get("parallel_strategy"), bool):
|
||||
logger.warning(
|
||||
"The 'parallel_strategy' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
|
||||
f" but got type {type(options.get('parallel_strategy_enable'))}, it will be set to true.")
|
||||
options["parallel_strategy_enable"] = True
|
||||
f" but got type {type(options.get('parallel_strategy'))}, it will be set to true.")
|
||||
options["parallel_strategy"] = True
|
||||
if not isinstance(options.get("profile_communication"), bool):
|
||||
logger.warning(
|
||||
"The 'hccl' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
|
||||
"The 'profile_communication' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
|
||||
f" but got type {type(options.get('profile_communication'))}, it will be set to false.")
|
||||
options["profile_communication"] = False
|
||||
if not isinstance(options.get("aicore_metrics"), int):
|
||||
if options.get("aicore_metrics") not in AICORE_METRICS_DICT:
|
||||
logger.warning(
|
||||
"The 'aicore_metrics' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
|
||||
f" but got type {type(options.get('aicore_metrics'))}, it will be set to 0.")
|
||||
"The 'aicore_metrics' parameter of the environment variable MS_PROFILE_OPTIONS must be in "
|
||||
f"[-1, 0, 1, 2, 3, 4, 5], but got {options.get('aicore_metrics')}, it will be set to 0.")
|
||||
options["aicore_metrics"] = 0
|
||||
if not isinstance(options.get("l2_cache"), bool):
|
||||
logger.warning(
|
||||
|
@ -146,6 +149,8 @@ def parse_ascend_args(options):
|
|||
def parse_profiling_args(options):
|
||||
"""Parsing profiling args."""
|
||||
profiling_options = parse_pubilc_args(options)
|
||||
if not profiling_options.get("start"):
|
||||
return profiling_options
|
||||
if context.get_context("device_target").lower() == DeviceTarget.ASCEND.value:
|
||||
options = parse_ascend_args(profiling_options)
|
||||
if context.get_context("device_target").lower() == DeviceTarget.GPU.value:
|
||||
|
@ -157,20 +162,24 @@ def combine_profile_options(profiling_options):
|
|||
"""Combined profiling options."""
|
||||
factor_s_to_us = 1e7
|
||||
output_path = os.path.join(os.getcwd(), "data")
|
||||
if context.get_context("device_target").upper() == "GPU":
|
||||
sync_enable = profiling_options.get("sync_enable", True)
|
||||
else:
|
||||
sync_enable = profiling_options.get("sync_enable", False)
|
||||
options = {
|
||||
"start": profiling_options.get('start', False),
|
||||
"start_time": int(time.time() * factor_s_to_us),
|
||||
"pid": os.getpid(),
|
||||
"output_path": profiling_options.get('output_path', output_path),
|
||||
"profiler_path": "",
|
||||
"profile_memory": profiling_options.get("memory", False),
|
||||
"profile_communication": profiling_options.get("hccl", False),
|
||||
"file_output_path": "",
|
||||
"profile_memory": profiling_options.get("profile_memory", False),
|
||||
"profile_communication": profiling_options.get("profile_communication", False),
|
||||
"aicore_metrics": profiling_options.get("aicore_metrics", 0),
|
||||
"l2_cache": profiling_options.get("l2_cache", False),
|
||||
"sync_enable": profiling_options.get("sync_enable", True),
|
||||
"data_process_enable": profiling_options.get("data_process", True),
|
||||
"timeline_limit_size": profiling_options.get("timeline_limit_size", 500),
|
||||
"parallel_strategy_enable": profiling_options.get("parallel_strategy", True),
|
||||
"sync_enable": sync_enable,
|
||||
"data_process": profiling_options.get("data_process", True),
|
||||
"timeline_limit": profiling_options.get("timeline_limit", 500),
|
||||
"parallel_strategy": profiling_options.get("parallel_strategy", True),
|
||||
}
|
||||
return options
|
||||
|
||||
|
@ -179,50 +188,18 @@ class EnvProfiler:
|
|||
"""Collect and analyze training performance data, support calls during and after training."""
|
||||
|
||||
def __init__(self):
|
||||
self._profiling_options = ''
|
||||
self._output_path = False
|
||||
self.profile_memory = False
|
||||
self.profile_communication = False
|
||||
self.aicore_metrics = 0
|
||||
self.l2_cache = False
|
||||
self.sync_enable = True
|
||||
self.start_time = 0
|
||||
self.parallel_strategy_enable = True
|
||||
self.timeline_limit_size = 500
|
||||
self.data_process_enable = True
|
||||
self._profiling_options = {}
|
||||
|
||||
def analyse(self):
    """Determine whether to stop collecting and parsing performance data based on environment variables.

    Returns without doing anything unless all of the following hold:

    * the ``MS_PROFILER_OPTIONS`` environment variable is set and non-empty;
    * the ``pid`` stored in ``MS_PROFILER_RUN_CONFIG`` equals the current process id;
    * the ``start`` entry of that config is truthy.

    Otherwise the decoded run config is handed unchanged to
    ``Profiler(env_enable=...)`` and ``analyse()`` is called on the result.
    """
    if not os.getenv("MS_PROFILER_OPTIONS"):
        return
    # The run config is passed through as-is, so its keys are never copied
    # (and never go stale) here.
    self._profiling_options = json.loads(os.getenv("MS_PROFILER_RUN_CONFIG", "{}"))
    if self._profiling_options.get("pid", 0) != os.getpid():
        return
    if not self._profiling_options.get("start"):
        return
    profiler = Profiler(env_enable=self._profiling_options)
    profiler.analyse()
|
||||
|
||||
|
||||
|
@ -233,18 +210,18 @@ def profiler_check_env():
|
|||
if os.getenv("MS_PROFILER_RUN_CONFIG"):
|
||||
return
|
||||
config = construct_profiling_options()
|
||||
os.environ["MS_PROFILER_RUN_CONFIG"] = json.dumps(config)
|
||||
if not config.get("start"):
|
||||
return
|
||||
os.environ["MS_PROFILER_RUN_CONFIG"] = json.dumps(config)
|
||||
Profiler(output_path=config.get("output_path"),
|
||||
profile_memory=config.get("profile_memory"),
|
||||
profile_communication=config.get("profile_communication"),
|
||||
data_process_enable=config.get("data_process_enable"),
|
||||
parallel_strategy_enable=config.get("parallel_strategy_enable"),
|
||||
data_process=config.get("data_process"),
|
||||
parallel_strategy=config.get("parallel_strategy"),
|
||||
aicore_metrics=config.get("aicore_metrics"),
|
||||
l2_cache=config.get("l2_cache"),
|
||||
sync_enable=config.get("sync_enable"),
|
||||
timeline_limit_size=config.get("timeline_limit_size"))
|
||||
timeline_limit=config.get("timeline_limit"))
|
||||
|
||||
|
||||
profiler_check_env()
|
||||
|
|
|
@ -51,6 +51,16 @@ from mindspore.profiler.parser.profiler_info import ProfilerInfo
|
|||
|
||||
INIT_OP_NAME = 'Default/InitDataSetQueue'
|
||||
|
||||
# Maps the integer ``aicore_metrics`` option to the metric name that is placed
# under "aic_metrics" in the Ascend profiling options. Membership in this dict
# is also what validates a user-supplied ``aicore_metrics`` value.
AICORE_METRICS_DICT = {
    0: "ArithmeticUtilization",
    1: "PipeUtilization",
    2: "Memory",
    3: "MemoryL0",
    4: "ResourceConflictRatio",
    5: "MemoryUB",
    -1: "None"
}
|
||||
|
||||
|
||||
def _environment_check():
|
||||
if c_expression.security.enable_security():
|
||||
|
@ -153,15 +163,6 @@ class Profiler:
|
|||
_has_initialized = False
|
||||
_ascend_profiling_options = ""
|
||||
_ascend_job_id = ""
|
||||
_aicore_metrics_dict = {
|
||||
0: "ArithmeticUtilization",
|
||||
1: "PipeUtilization",
|
||||
2: "Memory",
|
||||
3: "MemoryL0",
|
||||
4: "ResourceConflictRatio",
|
||||
5: "MemoryUB",
|
||||
-1: "None"
|
||||
}
|
||||
|
||||
def __init__(self, **kwargs):
|
||||
self._msprof_enable = os.getenv("PROFILER_SAMPLECONFIG")
|
||||
|
@ -195,7 +196,7 @@ class Profiler:
|
|||
# default aicore_metrics type is ArithmeticUtilization
|
||||
self._aicore_metrics_id = 0
|
||||
self._l2_cache = "off"
|
||||
self._data_process_enable = True
|
||||
self._data_process = True
|
||||
self._parser_kwargs(kwargs)
|
||||
# get device_id and device_target
|
||||
self._get_devid_rankid_and_devtarget()
|
||||
|
@ -399,11 +400,11 @@ class Profiler:
|
|||
self._cpu_profiler.step_profiling_enable(True)
|
||||
|
||||
if self._device_target and self._device_target == DeviceTarget.GPU.value:
|
||||
if self._data_process_enable:
|
||||
if self._data_process:
|
||||
self._md_profiler.start()
|
||||
self._gpu_profiler.step_profiling_enable(True)
|
||||
elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
|
||||
if self._data_process_enable:
|
||||
if self._data_process:
|
||||
self._md_profiler.start()
|
||||
self._ascend_graph_start()
|
||||
ProfilerInfo.set_profiling_start_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
|
||||
|
@ -449,7 +450,7 @@ class Profiler:
|
|||
# No need to stop anything if parse profiling data offline
|
||||
if self._is_offline_parser():
|
||||
return
|
||||
if self._data_process_enable:
|
||||
if self._data_process:
|
||||
self._md_profiler.stop()
|
||||
self._md_profiler.save(self._output_path)
|
||||
|
||||
|
@ -471,16 +472,16 @@ class Profiler:
|
|||
self._is_heterogeneous = False
|
||||
self._rank_size = 1
|
||||
self._start_time = options.get("start_time")
|
||||
self._output_path = options.get('output_path')
|
||||
self._output_path = options.get('file_output_path')
|
||||
self._profile_memory = options.get('profile_memory')
|
||||
self._parallel_strategy_enable = options.get('parallel_strategy_enable')
|
||||
self._timeline_size_limit_byte = options.get('timeline_limit_size')
|
||||
self._data_process_enable = options.get('data_process_enable')
|
||||
self._parallel_strategy = options.get('parallel_strategy')
|
||||
self._timeline_size_limit_byte = options.get('timeline_limit')
|
||||
self._data_process = options.get('data_process')
|
||||
self._profile_communication = options.get('profile_communication')
|
||||
self._device_target = context.get_context("device_target").lower()
|
||||
self._profiler_manager = c_expression.ProfilerManager.get_instance()
|
||||
self._cpu_profiler = c_expression.Profiler.get_instance("CPU")
|
||||
if self._data_process_enable:
|
||||
if self._data_process:
|
||||
self._md_profiler = cde.GlobalContext.profiling_manager()
|
||||
if self._device_target == DeviceTarget.GPU.value:
|
||||
self._gpu_profiler = c_expression.Profiler.get_instance("GPU")
|
||||
|
@ -532,7 +533,7 @@ class Profiler:
|
|||
def _gpu_profiler_init(self, kwargs):
|
||||
"""Gpu profiler init."""
|
||||
# Setup and start MindData Profiling
|
||||
if self._data_process_enable:
|
||||
if self._data_process:
|
||||
self._md_profiler = cde.GlobalContext.profiling_manager()
|
||||
self._md_profiler.init()
|
||||
self._parse_parameter_for_gpu(kwargs)
|
||||
|
@ -549,7 +550,7 @@ class Profiler:
|
|||
def _ascend_profiler_init(self, kwargs):
|
||||
"""Ascend profiler init."""
|
||||
# Setup and start MindData Profiling
|
||||
if self._data_process_enable:
|
||||
if self._data_process:
|
||||
self._md_profiler = cde.GlobalContext.profiling_manager()
|
||||
self._md_profiler.init()
|
||||
self._init_time = int(time.time() * 10000000)
|
||||
|
@ -594,12 +595,12 @@ class Profiler:
|
|||
"bp_point": bp_point,
|
||||
"training_trace": "on",
|
||||
"task_trace": "on",
|
||||
"aic_metrics": Profiler._aicore_metrics_dict.get(self._aicore_metrics_id, "ArithmeticUtilization"),
|
||||
"aic_metrics": AICORE_METRICS_DICT.get(self._aicore_metrics_id, "ArithmeticUtilization"),
|
||||
"aicpu": "on",
|
||||
"profile_memory": profile_memory,
|
||||
"hccl": profiler_communication,
|
||||
"l2_cache": self._l2_cache,
|
||||
"parallel_strategy": "on" if self._parallel_strategy_enable else "off",
|
||||
"parallel_strategy": "on" if self._parallel_strategy else "off",
|
||||
}
|
||||
|
||||
return profiling_options
|
||||
|
@ -650,9 +651,31 @@ class Profiler:
|
|||
raise TypeError(f"For '{self.__class__.__name__}', the parameter profile_memory must be bool, "
|
||||
f"but got type '{type(self._profile_memory)}'")
|
||||
|
||||
self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
|
||||
if not isinstance(self._aicore_metrics_id, int):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
|
||||
f"but got type {type(self._aicore_metrics_id)}")
|
||||
if self._aicore_metrics_id not in AICORE_METRICS_DICT:
|
||||
raise ValueError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
|
||||
f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}")
|
||||
|
||||
l2_cache_enable = kwargs.pop("l2_cache", False)
|
||||
if not isinstance(l2_cache_enable, bool):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
|
||||
f"but got type {type(l2_cache_enable)}")
|
||||
if l2_cache_enable:
|
||||
self._l2_cache = "on"
|
||||
else:
|
||||
self._l2_cache = "off"
|
||||
|
||||
self._parallel_strategy = kwargs.pop("parallel_strategy", True)
|
||||
if not isinstance(self._parallel_strategy, bool):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter parallel_strategy must be bool, "
|
||||
f"but got type {type(self._parallel_strategy)}")
|
||||
|
||||
self._sync_enable = kwargs.pop("sync_enable", False)
|
||||
if self._sync_enable:
|
||||
logger.warning(f"The parameter sync_enabl is not supported on Ascend currently.")
|
||||
logger.warning(f"The parameter sync_enable is not supported on Ascend currently.")
|
||||
|
||||
if kwargs:
|
||||
logger.warning("%s are invalid params which don't work.", kwargs)
|
||||
|
@ -1312,38 +1335,16 @@ class Profiler:
|
|||
|
||||
def _parser_kwargs(self, kwargs):
|
||||
"""Parse kwargs vale."""
|
||||
self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
|
||||
if not isinstance(self._aicore_metrics_id, int):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
|
||||
f"but got type {type(self._aicore_metrics_id)}")
|
||||
if self._aicore_metrics_id not in self._aicore_metrics_dict:
|
||||
raise ValueError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
|
||||
f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}")
|
||||
self._data_process = kwargs.pop("data_process", True)
|
||||
if not isinstance(self._data_process, bool):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process must be bool, "
|
||||
f"but got type {type(self._data_process)}")
|
||||
|
||||
l2_cache_enable = kwargs.pop("l2_cache", False)
|
||||
if not isinstance(l2_cache_enable, bool):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
|
||||
f"but got type {type(l2_cache_enable)}")
|
||||
if l2_cache_enable:
|
||||
self._l2_cache = "on"
|
||||
else:
|
||||
self._l2_cache = "off"
|
||||
|
||||
self._data_process_enable = kwargs.pop("data_process_enable", True)
|
||||
if not isinstance(self._data_process_enable, bool):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process_enable must be bool, "
|
||||
f"but got type {type(self.data_process_enable)}")
|
||||
|
||||
timeline_limit_size = kwargs.pop("timeline_limit_size", 500)
|
||||
if not isinstance(timeline_limit_size, int):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter timeline_limit_size must be int, "
|
||||
f"but got type {type(self.timeline_limit_size)}")
|
||||
self._timeline_size_limit_byte = timeline_limit_size * 1024 * 1024
|
||||
|
||||
self._parallel_strategy_enable = kwargs.pop("parallel_strategy_enable", True)
|
||||
if not isinstance(self._parallel_strategy_enable, bool):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter parallel_strategy_enable must be bool, "
|
||||
f"but got type {type(self._parallel_strategy_enable)}")
|
||||
timeline_limit = kwargs.pop("timeline_limit", 500)
|
||||
if not isinstance(timeline_limit, int):
|
||||
raise TypeError(f"For '{self.__class__.__name__}', the parameter timeline_limit must be int, "
|
||||
f"but got type {type(timeline_limit)}")
|
||||
self._timeline_size_limit_byte = timeline_limit * 1024 * 1024
|
||||
|
||||
def _analyse_hccl_info(self):
|
||||
"""Analyse hccl info."""
|
||||
|
|
|
@ -122,7 +122,7 @@ class TestEnvEnableProfiler:
|
|||
@security_off_wrap
|
||||
def test_gpu_profiler(self):
|
||||
status = os.system(
|
||||
"""export MS_PROFILER_OPTIONS='{"start":true, "memory":true, "sync_enable":true}';
|
||||
"""export MS_PROFILER_OPTIONS='{"start":true, "profile_memory":true, "sync_enable":true}';
|
||||
python ./run_net.py --target=GPU --mode=0;
|
||||
"""
|
||||
)
|
||||
|
@ -154,7 +154,7 @@ class TestEnvEnableProfiler:
|
|||
@security_off_wrap
|
||||
def test_ascend_profiler(self):
|
||||
status = os.system(
|
||||
"""export MS_PROFILER_OPTIONS='{"start":true, "memory":true}';
|
||||
"""export MS_PROFILER_OPTIONS='{"start":true, "profile_memory":true}';
|
||||
python ./run_net.py --target=Ascend --mode=0;
|
||||
"""
|
||||
)
|
||||
|
|
Loading…
Reference in New Issue