!47494 环境变量使能profiler与profiler接口方式参数统一

Merge pull request !47494 from liuchuting/env_p
This commit is contained in:
i-robot 2023-01-06 03:31:32 +00:00 committed by Gitee
commit 5708472537
No known key found for this signature in database
GPG Key ID: 173E9B9CA92EEF8F
3 changed files with 101 additions and 123 deletions

View File

@ -18,6 +18,7 @@ import os
import time
from enum import Enum
from mindspore.profiler import Profiler
from mindspore.profiler.profiling import AICORE_METRICS_DICT
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
from mindspore.profiler.parser.integrator import DeviceTarget
@ -27,9 +28,9 @@ from mindspore import log as logger, context
class DeviceSupportParam(Enum):
"""The device target enum."""
CPU = ['start', 'output_path']
GPU = ['start', 'output_path', 'data_process', 'timeline_limit_size', 'sync_enable']
ASCEND = ['start', 'output_path', 'data_process', 'timeline_limit_size', 'memory', 'parallel_strategy', 'hccl',
'aicore_metrics', 'l2_cache']
GPU = ['start', 'output_path', 'data_process', 'timeline_limit', 'sync_enable']
ASCEND = ['start', 'output_path', 'data_process', 'timeline_limit', 'profile_memory', 'parallel_strategy',
'profile_communication', 'aicore_metrics', 'l2_cache']
def get_profiling_options():
@ -68,7 +69,7 @@ def construct_profiling_options():
conbine_options["output_path"] = validate_and_normalize_path(output_path)
if not os.path.exists(output_path):
os.makedirs(output_path, exist_ok=True)
conbine_options["profiler_path"] = os.path.join(output_path, "profiler")
conbine_options["file_output_path"] = os.path.join(output_path, "profiler")
return conbine_options
@ -79,16 +80,18 @@ def parse_pubilc_args(options):
"The 'start' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
f" but got type {type(options.get('start'))}, it will be set to false.")
options["start"] = False
if not isinstance(options.get("data_process_enable"), bool):
if not options.get("start"):
return options
if not isinstance(options.get("data_process"), bool):
logger.warning(
"The 'data_process' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
f" but got type {type(options.get('data_process_enable'))}, it will be set to true.")
options["data_process_enable"] = True
if not isinstance(options.get("timeline_limit_size"), int):
f" but got type {type(options.get('data_process'))}, it will be set to true.")
options["data_process"] = True
if not isinstance(options.get("timeline_limit"), int):
logger.warning(
"The 'timeline_limit_size' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
f" but got type {type(options.get('timeline_limit_size'))}, it will be set to 500.")
options["timeline_limit_size"] = 500
"The 'timeline_limit' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
f" but got type {type(options.get('timeline_limit'))}, it will be set to 500.")
options["timeline_limit"] = 500
absolute_path = os.path.join(os.getcwd(), "data")
if not isinstance(options.get("output_path"), str):
logger.warning(
@ -117,23 +120,23 @@ def parse_ascend_args(options):
"""Parsing ascend profiling args."""
if not isinstance(options.get("profile_memory"), bool):
logger.warning(
"The 'memory' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
"The 'profile_memory' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
f" but got type {type(options.get('profile_memory'))}, it will be set to false.")
options["profile_memory"] = False
if not isinstance(options.get("parallel_strategy_enable"), bool):
if not isinstance(options.get("parallel_strategy"), bool):
logger.warning(
"The 'parallel_strategy' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
f" but got type {type(options.get('parallel_strategy_enable'))}, it will be set to true.")
options["parallel_strategy_enable"] = True
f" but got type {type(options.get('parallel_strategy'))}, it will be set to true.")
options["parallel_strategy"] = True
if not isinstance(options.get("profile_communication"), bool):
logger.warning(
"The 'hccl' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
"The 'profile_communication' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
f" but got type {type(options.get('profile_communication'))}, it will be set to false.")
options["profile_communication"] = False
if not isinstance(options.get("aicore_metrics"), int):
if options.get("aicore_metrics") not in AICORE_METRICS_DICT:
logger.warning(
"The 'aicore_metrics' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
f" but got type {type(options.get('aicore_metrics'))}, it will be set to 0.")
"The 'aicore_metrics' parameter of the environment variable MS_PROFILE_OPTIONS must be in "
f"[-1, 0, 1, 2, 3, 4, 5], but got {options.get('aicore_metrics')}, it will be set to 0.")
options["aicore_metrics"] = 0
if not isinstance(options.get("l2_cache"), bool):
logger.warning(
@ -146,6 +149,8 @@ def parse_ascend_args(options):
def parse_profiling_args(options):
"""Parsing profiling args."""
profiling_options = parse_pubilc_args(options)
if not profiling_options.get("start"):
return profiling_options
if context.get_context("device_target").lower() == DeviceTarget.ASCEND.value:
options = parse_ascend_args(profiling_options)
if context.get_context("device_target").lower() == DeviceTarget.GPU.value:
@ -157,20 +162,24 @@ def combine_profile_options(profiling_options):
"""Combined profiling options."""
factor_s_to_us = 1e7
output_path = os.path.join(os.getcwd(), "data")
if context.get_context("device_target").upper() == "GPU":
sync_enable = profiling_options.get("sync_enable", True)
else:
sync_enable = profiling_options.get("sync_enable", False)
options = {
"start": profiling_options.get('start', False),
"start_time": int(time.time() * factor_s_to_us),
"pid": os.getpid(),
"output_path": profiling_options.get('output_path', output_path),
"profiler_path": "",
"profile_memory": profiling_options.get("memory", False),
"profile_communication": profiling_options.get("hccl", False),
"file_output_path": "",
"profile_memory": profiling_options.get("profile_memory", False),
"profile_communication": profiling_options.get("profile_communication", False),
"aicore_metrics": profiling_options.get("aicore_metrics", 0),
"l2_cache": profiling_options.get("l2_cache", False),
"sync_enable": profiling_options.get("sync_enable", True),
"data_process_enable": profiling_options.get("data_process", True),
"timeline_limit_size": profiling_options.get("timeline_limit_size", 500),
"parallel_strategy_enable": profiling_options.get("parallel_strategy", True),
"sync_enable": sync_enable,
"data_process": profiling_options.get("data_process", True),
"timeline_limit": profiling_options.get("timeline_limit", 500),
"parallel_strategy": profiling_options.get("parallel_strategy", True),
}
return options
@ -179,50 +188,18 @@ class EnvProfiler:
"""Collect and analyze training performance data, support calls during and after training."""
def __init__(self):
self._profiling_options = ''
self._output_path = False
self.profile_memory = False
self.profile_communication = False
self.aicore_metrics = 0
self.l2_cache = False
self.sync_enable = True
self.start_time = 0
self.parallel_strategy_enable = True
self.timeline_limit_size = 500
self.data_process_enable = True
self._profiling_options = {}
def analyse(self):
"""Determine whether to stop collecting and parsing performance data based on environment variables."""
if not os.getenv("MS_PROFILER_OPTIONS"):
return
options = json.loads(os.getenv("MS_PROFILER_RUN_CONFIG", "{}"))
if not options.get("pid", 0) == os.getpid():
self._profiling_options = json.loads(os.getenv("MS_PROFILER_RUN_CONFIG", "{}"))
if not self._profiling_options.get("pid", 0) == os.getpid():
return
if not options.get("start"):
if not self._profiling_options.get("start"):
return
self._output_path = options.get("profiler_path")
self.profile_memory = options.get("profile_memory")
self.profile_communication = options.get("profile_communication")
self.aicore_metrics = options.get("aicore_metrics")
self.l2_cache = options.get("l2_cache")
self.sync_enable = options.get("sync_enable")
self.parallel_strategy_enable = options.get("parallel_strategy_enable")
self.timeline_limit_size = options.get("timeline_limit_size")
self.data_process_enable = options.get("data_process_enable")
self.start_time = options.get("start_time")
options = {
"output_path": self._output_path,
"profile_memory": self.profile_memory,
"profile_communication": self.profile_communication,
"aicore_metrics": self.aicore_metrics,
"l2_cache": self.l2_cache,
"start_time": self.start_time,
"sync_enable": self.sync_enable,
"parallel_strategy_enable": self.parallel_strategy_enable,
"timeline_limit_size": self.timeline_limit_size,
"data_process_enable": self.data_process_enable
}
profiler = Profiler(env_enable=options)
profiler = Profiler(env_enable=self._profiling_options)
profiler.analyse()
@ -233,18 +210,18 @@ def profiler_check_env():
if os.getenv("MS_PROFILER_RUN_CONFIG"):
return
config = construct_profiling_options()
os.environ["MS_PROFILER_RUN_CONFIG"] = json.dumps(config)
if not config.get("start"):
return
os.environ["MS_PROFILER_RUN_CONFIG"] = json.dumps(config)
Profiler(output_path=config.get("output_path"),
profile_memory=config.get("profile_memory"),
profile_communication=config.get("profile_communication"),
data_process_enable=config.get("data_process_enable"),
parallel_strategy_enable=config.get("parallel_strategy_enable"),
data_process=config.get("data_process"),
parallel_strategy=config.get("parallel_strategy"),
aicore_metrics=config.get("aicore_metrics"),
l2_cache=config.get("l2_cache"),
sync_enable=config.get("sync_enable"),
timeline_limit_size=config.get("timeline_limit_size"))
timeline_limit=config.get("timeline_limit"))
profiler_check_env()

View File

@ -51,6 +51,16 @@ from mindspore.profiler.parser.profiler_info import ProfilerInfo
INIT_OP_NAME = 'Default/InitDataSetQueue'
# Maps the integer `aicore_metrics` option to the metric name that is written
# into the "aic_metrics" field of the Ascend profiling options. The keys are
# also the set of accepted `aicore_metrics` values; -1 maps to the string "None".
AICORE_METRICS_DICT = {
0: "ArithmeticUtilization",
1: "PipeUtilization",
2: "Memory",
3: "MemoryL0",
4: "ResourceConflictRatio",
5: "MemoryUB",
-1: "None"
}
def _environment_check():
if c_expression.security.enable_security():
@ -153,15 +163,6 @@ class Profiler:
_has_initialized = False
_ascend_profiling_options = ""
_ascend_job_id = ""
_aicore_metrics_dict = {
0: "ArithmeticUtilization",
1: "PipeUtilization",
2: "Memory",
3: "MemoryL0",
4: "ResourceConflictRatio",
5: "MemoryUB",
-1: "None"
}
def __init__(self, **kwargs):
self._msprof_enable = os.getenv("PROFILER_SAMPLECONFIG")
@ -195,7 +196,7 @@ class Profiler:
# default aicore_metrics type is ArithmeticUtilization
self._aicore_metrics_id = 0
self._l2_cache = "off"
self._data_process_enable = True
self._data_process = True
self._parser_kwargs(kwargs)
# get device_id and device_target
self._get_devid_rankid_and_devtarget()
@ -399,11 +400,11 @@ class Profiler:
self._cpu_profiler.step_profiling_enable(True)
if self._device_target and self._device_target == DeviceTarget.GPU.value:
if self._data_process_enable:
if self._data_process:
self._md_profiler.start()
self._gpu_profiler.step_profiling_enable(True)
elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
if self._data_process_enable:
if self._data_process:
self._md_profiler.start()
self._ascend_graph_start()
ProfilerInfo.set_profiling_start_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
@ -449,7 +450,7 @@ class Profiler:
# No need to stop anything if parse profiling data offline
if self._is_offline_parser():
return
if self._data_process_enable:
if self._data_process:
self._md_profiler.stop()
self._md_profiler.save(self._output_path)
@ -471,16 +472,16 @@ class Profiler:
self._is_heterogeneous = False
self._rank_size = 1
self._start_time = options.get("start_time")
self._output_path = options.get('output_path')
self._output_path = options.get('file_output_path')
self._profile_memory = options.get('profile_memory')
self._parallel_strategy_enable = options.get('parallel_strategy_enable')
self._timeline_size_limit_byte = options.get('timeline_limit_size')
self._data_process_enable = options.get('data_process_enable')
self._parallel_strategy = options.get('parallel_strategy')
self._timeline_size_limit_byte = options.get('timeline_limit')
self._data_process = options.get('data_process')
self._profile_communication = options.get('profile_communication')
self._device_target = context.get_context("device_target").lower()
self._profiler_manager = c_expression.ProfilerManager.get_instance()
self._cpu_profiler = c_expression.Profiler.get_instance("CPU")
if self._data_process_enable:
if self._data_process:
self._md_profiler = cde.GlobalContext.profiling_manager()
if self._device_target == DeviceTarget.GPU.value:
self._gpu_profiler = c_expression.Profiler.get_instance("GPU")
@ -532,7 +533,7 @@ class Profiler:
def _gpu_profiler_init(self, kwargs):
"""Gpu profiler init."""
# Setup and start MindData Profiling
if self._data_process_enable:
if self._data_process:
self._md_profiler = cde.GlobalContext.profiling_manager()
self._md_profiler.init()
self._parse_parameter_for_gpu(kwargs)
@ -549,7 +550,7 @@ class Profiler:
def _ascend_profiler_init(self, kwargs):
"""Ascend profiler init."""
# Setup and start MindData Profiling
if self._data_process_enable:
if self._data_process:
self._md_profiler = cde.GlobalContext.profiling_manager()
self._md_profiler.init()
self._init_time = int(time.time() * 10000000)
@ -594,12 +595,12 @@ class Profiler:
"bp_point": bp_point,
"training_trace": "on",
"task_trace": "on",
"aic_metrics": Profiler._aicore_metrics_dict.get(self._aicore_metrics_id, "ArithmeticUtilization"),
"aic_metrics": AICORE_METRICS_DICT.get(self._aicore_metrics_id, "ArithmeticUtilization"),
"aicpu": "on",
"profile_memory": profile_memory,
"hccl": profiler_communication,
"l2_cache": self._l2_cache,
"parallel_strategy": "on" if self._parallel_strategy_enable else "off",
"parallel_strategy": "on" if self._parallel_strategy else "off",
}
return profiling_options
@ -650,9 +651,31 @@ class Profiler:
raise TypeError(f"For '{self.__class__.__name__}', the parameter profile_memory must be bool, "
f"but got type '{type(self._profile_memory)}'")
self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
if not isinstance(self._aicore_metrics_id, int):
raise TypeError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
f"but got type {type(self._aicore_metrics_id)}")
if self._aicore_metrics_id not in AICORE_METRICS_DICT:
raise ValueError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}")
l2_cache_enable = kwargs.pop("l2_cache", False)
if not isinstance(l2_cache_enable, bool):
raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
f"but got type {type(l2_cache_enable)}")
if l2_cache_enable:
self._l2_cache = "on"
else:
self._l2_cache = "off"
self._parallel_strategy = kwargs.pop("parallel_strategy", True)
if not isinstance(self._parallel_strategy, bool):
raise TypeError(f"For '{self.__class__.__name__}', the parameter parallel_strategy must be bool, "
f"but got type {type(self._parallel_strategy)}")
self._sync_enable = kwargs.pop("sync_enable", False)
if self._sync_enable:
logger.warning(f"The parameter sync_enabl is not supported on Ascend currently.")
logger.warning(f"The parameter sync_enable is not supported on Ascend currently.")
if kwargs:
logger.warning("%s are invalid params which don't work.", kwargs)
@ -1312,38 +1335,16 @@ class Profiler:
def _parser_kwargs(self, kwargs):
"""Parse kwargs value."""
self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
if not isinstance(self._aicore_metrics_id, int):
raise TypeError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
f"but got type {type(self._aicore_metrics_id)}")
if self._aicore_metrics_id not in self._aicore_metrics_dict:
raise ValueError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}")
self._data_process = kwargs.pop("data_process", True)
if not isinstance(self._data_process, bool):
raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process must be bool, "
f"but got type {type(self._data_process)}")
l2_cache_enable = kwargs.pop("l2_cache", False)
if not isinstance(l2_cache_enable, bool):
raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
f"but got type {type(l2_cache_enable)}")
if l2_cache_enable:
self._l2_cache = "on"
else:
self._l2_cache = "off"
self._data_process_enable = kwargs.pop("data_process_enable", True)
if not isinstance(self._data_process_enable, bool):
raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process_enable must be bool, "
f"but got type {type(self.data_process_enable)}")
timeline_limit_size = kwargs.pop("timeline_limit_size", 500)
if not isinstance(timeline_limit_size, int):
raise TypeError(f"For '{self.__class__.__name__}', the parameter timeline_limit_size must be int, "
f"but got type {type(self.timeline_limit_size)}")
self._timeline_size_limit_byte = timeline_limit_size * 1024 * 1024
self._parallel_strategy_enable = kwargs.pop("parallel_strategy_enable", True)
if not isinstance(self._parallel_strategy_enable, bool):
raise TypeError(f"For '{self.__class__.__name__}', the parameter parallel_strategy_enable must be bool, "
f"but got type {type(self._parallel_strategy_enable)}")
timeline_limit = kwargs.pop("timeline_limit", 500)
if not isinstance(timeline_limit, int):
raise TypeError(f"For '{self.__class__.__name__}', the parameter timeline_limit must be int, "
f"but got type {type(timeline_limit)}")
self._timeline_size_limit_byte = timeline_limit * 1024 * 1024
def _analyse_hccl_info(self):
"""Analyse hccl info."""

View File

@ -122,7 +122,7 @@ class TestEnvEnableProfiler:
@security_off_wrap
def test_gpu_profiler(self):
status = os.system(
"""export MS_PROFILER_OPTIONS='{"start":true, "memory":true, "sync_enable":true}';
"""export MS_PROFILER_OPTIONS='{"start":true, "profile_memory":true, "sync_enable":true}';
python ./run_net.py --target=GPU --mode=0;
"""
)
@ -154,7 +154,7 @@ class TestEnvEnableProfiler:
@security_off_wrap
def test_ascend_profiler(self):
status = os.system(
"""export MS_PROFILER_OPTIONS='{"start":true, "memory":true}';
"""export MS_PROFILER_OPTIONS='{"start":true, "profile_memory":true}';
python ./run_net.py --target=Ascend --mode=0;
"""
)