!47494 环境变量使能profiler与profiler接口方式参数统一

Merge pull request !47494 from liuchuting/env_p
2023-01-06 03:31:32 +00:00 · 2023-01-06 03:31:32 +00:00 · 5708472537
parent a127654425 ae409f5f60
commit 5708472537
3 changed files with 101 additions and 123 deletions
--- a/mindspore/python/mindspore/profiler/envprofiling.py
+++ b/mindspore/python/mindspore/profiler/envprofiling.py
@@ -18,6 +18,7 @@ import os
 import time
 from enum import Enum
 from mindspore.profiler import Profiler
+from mindspore.profiler.profiling import AICORE_METRICS_DICT
 from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
 from mindspore.profiler.parser.integrator import DeviceTarget

@@ -27,9 +28,9 @@ from mindspore import log as logger, context
 class DeviceSupportParam(Enum):
    """The device target enum."""
    CPU = ['start', 'output_path']
-    GPU = ['start', 'output_path', 'data_process', 'timeline_limit_size', 'sync_enable']
-    ASCEND = ['start', 'output_path', 'data_process', 'timeline_limit_size', 'memory', 'parallel_strategy', 'hccl',
-              'aicore_metrics', 'l2_cache']
+    GPU = ['start', 'output_path', 'data_process', 'timeline_limit', 'sync_enable']
+    ASCEND = ['start', 'output_path', 'data_process', 'timeline_limit', 'profile_memory', 'parallel_strategy',
+              'profile_communication', 'aicore_metrics', 'l2_cache']


 def get_profiling_options():
@@ -68,7 +69,7 @@ def construct_profiling_options():
        conbine_options["output_path"] = validate_and_normalize_path(output_path)
        if not os.path.exists(output_path):
            os.makedirs(output_path, exist_ok=True)
-        conbine_options["profiler_path"] = os.path.join(output_path, "profiler")
+        conbine_options["file_output_path"] = os.path.join(output_path, "profiler")
    return conbine_options


@@ -79,16 +80,18 @@ def parse_pubilc_args(options):
            "The 'start' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
            f" but got type {type(options.get('start'))}, it will be set to false.")
        options["start"] = False
-    if not isinstance(options.get("data_process_enable"), bool):
+    if not options.get("start"):
+        return options
+    if not isinstance(options.get("data_process"), bool):
        logger.warning(
            "The 'data_process' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
-            f" but got type {type(options.get('data_process_enable'))}, it will be set to true.")
-        options["data_process_enable"] = True
-    if not isinstance(options.get("timeline_limit_size"), int):
+            f" but got type {type(options.get('data_process'))}, it will be set to true.")
+        options["data_process"] = True
+    if not isinstance(options.get("timeline_limit"), int):
        logger.warning(
-            "The 'timeline_limit_size' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
-            f" but got type {type(options.get('timeline_limit_size'))}, it will be set to 500.")
-        options["timeline_limit_size"] = 500
+            "The 'timeline_limit' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
+            f" but got type {type(options.get('timeline_limit'))}, it will be set to 500.")
+        options["timeline_limit"] = 500
    absolute_path = os.path.join(os.getcwd(), "data")
    if not isinstance(options.get("output_path"), str):
        logger.warning(
@@ -117,23 +120,23 @@ def parse_ascend_args(options):
    """Parsing ascend profiling args."""
    if not isinstance(options.get("profile_memory"), bool):
        logger.warning(
-            "The 'memory' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
+            "The 'profile_memory' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
            f" but got type {type(options.get('profile_memory'))}, it will be set to false.")
        options["profile_memory"] = False
-    if not isinstance(options.get("parallel_strategy_enable"), bool):
+    if not isinstance(options.get("parallel_strategy"), bool):
        logger.warning(
            "The 'parallel_strategy' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
-            f" but got type {type(options.get('parallel_strategy_enable'))}, it will be set to true.")
-        options["parallel_strategy_enable"] = True
+            f" but got type {type(options.get('parallel_strategy'))}, it will be set to true.")
+        options["parallel_strategy"] = True
    if not isinstance(options.get("profile_communication"), bool):
        logger.warning(
-            "The 'hccl' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
+            "The 'profile_communication' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
            f" but got type {type(options.get('profile_communication'))}, it will be set to false.")
        options["profile_communication"] = False
-    if not isinstance(options.get("aicore_metrics"), int):
+    if options.get("aicore_metrics") not in AICORE_METRICS_DICT:
        logger.warning(
-            "The 'aicore_metrics' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
-            f" but got type {type(options.get('aicore_metrics'))}, it will be set to 0.")
+            "The 'aicore_metrics' parameter of the environment variable MS_PROFILE_OPTIONS must be in "
+            f"[-1, 0, 1, 2, 3, 4, 5], but got {options.get('aicore_metrics')}, it will be set to 0.")
        options["aicore_metrics"] = 0
    if not isinstance(options.get("l2_cache"), bool):
        logger.warning(
@@ -146,6 +149,8 @@ def parse_ascend_args(options):
 def parse_profiling_args(options):
    """Parsing profiling args."""
    profiling_options = parse_pubilc_args(options)
+    if not profiling_options.get("start"):
+        return profiling_options
    if context.get_context("device_target").lower() == DeviceTarget.ASCEND.value:
        options = parse_ascend_args(profiling_options)
    if context.get_context("device_target").lower() == DeviceTarget.GPU.value:
@@ -157,20 +162,24 @@ def combine_profile_options(profiling_options):
    """Combined profiling options."""
    factor_s_to_us = 1e7
    output_path = os.path.join(os.getcwd(), "data")
+    if context.get_context("device_target").upper() == "GPU":
+        sync_enable = profiling_options.get("sync_enable", True)
+    else:
+        sync_enable = profiling_options.get("sync_enable", False)
    options = {
        "start": profiling_options.get('start', False),
        "start_time": int(time.time() * factor_s_to_us),
        "pid": os.getpid(),
        "output_path": profiling_options.get('output_path', output_path),
-        "profiler_path": "",
-        "profile_memory": profiling_options.get("memory", False),
-        "profile_communication": profiling_options.get("hccl", False),
+        "file_output_path": "",
+        "profile_memory": profiling_options.get("profile_memory", False),
+        "profile_communication": profiling_options.get("profile_communication", False),
        "aicore_metrics": profiling_options.get("aicore_metrics", 0),
        "l2_cache": profiling_options.get("l2_cache", False),
-        "sync_enable": profiling_options.get("sync_enable", True),
-        "data_process_enable": profiling_options.get("data_process", True),
-        "timeline_limit_size": profiling_options.get("timeline_limit_size", 500),
-        "parallel_strategy_enable": profiling_options.get("parallel_strategy", True),
+        "sync_enable": sync_enable,
+        "data_process": profiling_options.get("data_process", True),
+        "timeline_limit": profiling_options.get("timeline_limit", 500),
+        "parallel_strategy": profiling_options.get("parallel_strategy", True),
    }
    return options

@@ -179,50 +188,18 @@ class EnvProfiler:
    """Collect and analyze training performance data, support calls during and after training."""

    def __init__(self):
-        self._profiling_options = ''
-        self._output_path = False
-        self.profile_memory = False
-        self.profile_communication = False
-        self.aicore_metrics = 0
-        self.l2_cache = False
-        self.sync_enable = True
-        self.start_time = 0
-        self.parallel_strategy_enable = True
-        self.timeline_limit_size = 500
-        self.data_process_enable = True
+        self._profiling_options = {}

    def analyse(self):
        """Determine whether to stop collecting and parsing performance data based on environment variables."""
        if not os.getenv("MS_PROFILER_OPTIONS"):
            return
-        options = json.loads(os.getenv("MS_PROFILER_RUN_CONFIG", "{}"))
-        if not options.get("pid", 0) == os.getpid():
+        self._profiling_options = json.loads(os.getenv("MS_PROFILER_RUN_CONFIG", "{}"))
+        if not self._profiling_options.get("pid", 0) == os.getpid():
            return
-        if not options.get("start"):
+        if not self._profiling_options.get("start"):
            return
-        self._output_path = options.get("profiler_path")
-        self.profile_memory = options.get("profile_memory")
-        self.profile_communication = options.get("profile_communication")
-        self.aicore_metrics = options.get("aicore_metrics")
-        self.l2_cache = options.get("l2_cache")
-        self.sync_enable = options.get("sync_enable")
-        self.parallel_strategy_enable = options.get("parallel_strategy_enable")
-        self.timeline_limit_size = options.get("timeline_limit_size")
-        self.data_process_enable = options.get("data_process_enable")
-        self.start_time = options.get("start_time")
-        options = {
-            "output_path": self._output_path,
-            "profile_memory": self.profile_memory,
-            "profile_communication": self.profile_communication,
-            "aicore_metrics": self.aicore_metrics,
-            "l2_cache": self.l2_cache,
-            "start_time": self.start_time,
-            "sync_enable": self.sync_enable,
-            "parallel_strategy_enable": self.parallel_strategy_enable,
-            "timeline_limit_size": self.timeline_limit_size,
-            "data_process_enable": self.data_process_enable
-        }
-        profiler = Profiler(env_enable=options)
+        profiler = Profiler(env_enable=self._profiling_options)
        profiler.analyse()


@@ -233,18 +210,18 @@ def profiler_check_env():
    if os.getenv("MS_PROFILER_RUN_CONFIG"):
        return
    config = construct_profiling_options()
+    os.environ["MS_PROFILER_RUN_CONFIG"] = json.dumps(config)
    if not config.get("start"):
        return
-    os.environ["MS_PROFILER_RUN_CONFIG"] = json.dumps(config)
    Profiler(output_path=config.get("output_path"),
             profile_memory=config.get("profile_memory"),
             profile_communication=config.get("profile_communication"),
-             data_process_enable=config.get("data_process_enable"),
-             parallel_strategy_enable=config.get("parallel_strategy_enable"),
+             data_process=config.get("data_process"),
+             parallel_strategy=config.get("parallel_strategy"),
             aicore_metrics=config.get("aicore_metrics"),
             l2_cache=config.get("l2_cache"),
             sync_enable=config.get("sync_enable"),
-             timeline_limit_size=config.get("timeline_limit_size"))
+             timeline_limit=config.get("timeline_limit"))


 profiler_check_env()
--- a/mindspore/python/mindspore/profiler/profiling.py
+++ b/mindspore/python/mindspore/profiler/profiling.py
@@ -51,6 +51,16 @@ from mindspore.profiler.parser.profiler_info import ProfilerInfo

 INIT_OP_NAME = 'Default/InitDataSetQueue'

+AICORE_METRICS_DICT = {
+    0: "ArithmeticUtilization",
+    1: "PipeUtilization",
+    2: "Memory",
+    3: "MemoryL0",
+    4: "ResourceConflictRatio",
+    5: "MemoryUB",
+    -1: "None"
+}
+

 def _environment_check():
    if c_expression.security.enable_security():
@@ -153,15 +163,6 @@ class Profiler:
    _has_initialized = False
    _ascend_profiling_options = ""
    _ascend_job_id = ""
-    _aicore_metrics_dict = {
-        0: "ArithmeticUtilization",
-        1: "PipeUtilization",
-        2: "Memory",
-        3: "MemoryL0",
-        4: "ResourceConflictRatio",
-        5: "MemoryUB",
-        -1: "None"
-    }

    def __init__(self, **kwargs):
        self._msprof_enable = os.getenv("PROFILER_SAMPLECONFIG")
@@ -195,7 +196,7 @@ class Profiler:
        # default aicore_metrics type is ArithmeticUtilization
        self._aicore_metrics_id = 0
        self._l2_cache = "off"
-        self._data_process_enable = True
+        self._data_process = True
        self._parser_kwargs(kwargs)
        # get device_id and device_target
        self._get_devid_rankid_and_devtarget()
@@ -399,11 +400,11 @@ class Profiler:
        self._cpu_profiler.step_profiling_enable(True)

        if self._device_target and self._device_target == DeviceTarget.GPU.value:
-            if self._data_process_enable:
+            if self._data_process:
                self._md_profiler.start()
            self._gpu_profiler.step_profiling_enable(True)
        elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
-            if self._data_process_enable:
+            if self._data_process:
                self._md_profiler.start()
            self._ascend_graph_start()
        ProfilerInfo.set_profiling_start_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
@@ -449,7 +450,7 @@ class Profiler:
        # No need to stop anything if parse profiling data offline
        if self._is_offline_parser():
            return
-        if self._data_process_enable:
+        if self._data_process:
            self._md_profiler.stop()
            self._md_profiler.save(self._output_path)

@@ -471,16 +472,16 @@ class Profiler:
        self._is_heterogeneous = False
        self._rank_size = 1
        self._start_time = options.get("start_time")
-        self._output_path = options.get('output_path')
+        self._output_path = options.get('file_output_path')
        self._profile_memory = options.get('profile_memory')
-        self._parallel_strategy_enable = options.get('parallel_strategy_enable')
-        self._timeline_size_limit_byte = options.get('timeline_limit_size')
-        self._data_process_enable = options.get('data_process_enable')
+        self._parallel_strategy = options.get('parallel_strategy')
+        self._timeline_size_limit_byte = options.get('timeline_limit')
+        self._data_process = options.get('data_process')
        self._profile_communication = options.get('profile_communication')
        self._device_target = context.get_context("device_target").lower()
        self._profiler_manager = c_expression.ProfilerManager.get_instance()
        self._cpu_profiler = c_expression.Profiler.get_instance("CPU")
-        if self._data_process_enable:
+        if self._data_process:
            self._md_profiler = cde.GlobalContext.profiling_manager()
        if self._device_target == DeviceTarget.GPU.value:
            self._gpu_profiler = c_expression.Profiler.get_instance("GPU")
@@ -532,7 +533,7 @@ class Profiler:
    def _gpu_profiler_init(self, kwargs):
        """Gpu profiler init."""
        # Setup and start MindData Profiling
-        if self._data_process_enable:
+        if self._data_process:
            self._md_profiler = cde.GlobalContext.profiling_manager()
            self._md_profiler.init()
        self._parse_parameter_for_gpu(kwargs)
@@ -549,7 +550,7 @@ class Profiler:
    def _ascend_profiler_init(self, kwargs):
        """Ascend profiler init."""
        # Setup and start MindData Profiling
-        if self._data_process_enable:
+        if self._data_process:
            self._md_profiler = cde.GlobalContext.profiling_manager()
            self._md_profiler.init()
        self._init_time = int(time.time() * 10000000)
@@ -594,12 +595,12 @@ class Profiler:
            "bp_point": bp_point,
            "training_trace": "on",
            "task_trace": "on",
-            "aic_metrics": Profiler._aicore_metrics_dict.get(self._aicore_metrics_id, "ArithmeticUtilization"),
+            "aic_metrics": AICORE_METRICS_DICT.get(self._aicore_metrics_id, "ArithmeticUtilization"),
            "aicpu": "on",
            "profile_memory": profile_memory,
            "hccl": profiler_communication,
            "l2_cache": self._l2_cache,
-            "parallel_strategy": "on" if self._parallel_strategy_enable else "off",
+            "parallel_strategy": "on" if self._parallel_strategy else "off",
        }

        return profiling_options
@@ -650,9 +651,31 @@ class Profiler:
            raise TypeError(f"For '{self.__class__.__name__}', the parameter profile_memory must be bool, "
                            f"but got type '{type(self._profile_memory)}'")

+        self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
+        if not isinstance(self._aicore_metrics_id, int):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
+                            f"but got type {type(self._aicore_metrics_id)}")
+        if self._aicore_metrics_id not in AICORE_METRICS_DICT:
+            raise ValueError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
+                             f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}")
+
+        l2_cache_enable = kwargs.pop("l2_cache", False)
+        if not isinstance(l2_cache_enable, bool):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
+                            f"but got type {type(l2_cache_enable)}")
+        if l2_cache_enable:
+            self._l2_cache = "on"
+        else:
+            self._l2_cache = "off"
+
+        self._parallel_strategy = kwargs.pop("parallel_strategy", True)
+        if not isinstance(self._parallel_strategy, bool):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter parallel_strategy must be bool, "
+                            f"but got type {type(self._parallel_strategy)}")
+
        self._sync_enable = kwargs.pop("sync_enable", False)
        if self._sync_enable:
-            logger.warning(f"The parameter sync_enabl is not supported on Ascend currently.")
+            logger.warning(f"The parameter sync_enable is not supported on Ascend currently.")

        if kwargs:
            logger.warning("%s are invalid params which don't work.", kwargs)
@@ -1312,38 +1335,16 @@ class Profiler:

    def _parser_kwargs(self, kwargs):
        """Parse kwargs vale."""
-        self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
-        if not isinstance(self._aicore_metrics_id, int):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
-                            f"but got type {type(self._aicore_metrics_id)}")
-        if self._aicore_metrics_id not in self._aicore_metrics_dict:
-            raise ValueError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
-                             f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}")
+        self._data_process = kwargs.pop("data_process", True)
+        if not isinstance(self._data_process, bool):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process must be bool, "
+                            f"but got type {type(self._data_process)}")

-        l2_cache_enable = kwargs.pop("l2_cache", False)
-        if not isinstance(l2_cache_enable, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
-                            f"but got type {type(l2_cache_enable)}")
-        if l2_cache_enable:
-            self._l2_cache = "on"
-        else:
-            self._l2_cache = "off"
-
-        self._data_process_enable = kwargs.pop("data_process_enable", True)
-        if not isinstance(self._data_process_enable, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process_enable must be bool, "
-                            f"but got type {type(self.data_process_enable)}")
-
-        timeline_limit_size = kwargs.pop("timeline_limit_size", 500)
-        if not isinstance(timeline_limit_size, int):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter timeline_limit_size must be int, "
-                            f"but got type {type(self.timeline_limit_size)}")
-        self._timeline_size_limit_byte = timeline_limit_size * 1024 * 1024
-
-        self._parallel_strategy_enable = kwargs.pop("parallel_strategy_enable", True)
-        if not isinstance(self._parallel_strategy_enable, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter parallel_strategy_enable must be bool, "
-                            f"but got type {type(self._parallel_strategy_enable)}")
+        timeline_limit = kwargs.pop("timeline_limit", 500)
+        if not isinstance(timeline_limit, int):
+            raise TypeError(f"For '{self.__class__.__name__}', the parameter timeline_limit must be int, "
+                            f"but got type {type(timeline_limit)}")
+        self._timeline_size_limit_byte = timeline_limit * 1024 * 1024

    def _analyse_hccl_info(self):
        """Analyse hccl info."""
--- a/tests/st/profiler/test_env_enable_profiler.py
+++ b/tests/st/profiler/test_env_enable_profiler.py
@@ -122,7 +122,7 @@ class TestEnvEnableProfiler:
    @security_off_wrap
    def test_gpu_profiler(self):
        status = os.system(
-            """export MS_PROFILER_OPTIONS='{"start":true, "memory":true, "sync_enable":true}';
+            """export MS_PROFILER_OPTIONS='{"start":true, "profile_memory":true, "sync_enable":true}';
               python ./run_net.py --target=GPU --mode=0;
            """
        )
@@ -154,7 +154,7 @@ class TestEnvEnableProfiler:
    @security_off_wrap
    def test_ascend_profiler(self):
        status = os.system(
-            """export MS_PROFILER_OPTIONS='{"start":true, "memory":true}';
+            """export MS_PROFILER_OPTIONS='{"start":true, "profile_memory":true}';
               python ./run_net.py --target=Ascend --mode=0;
            """
        )