From 69b14bb126dfcf58706df24bb6692632b70d40ca Mon Sep 17 00:00:00 2001
From: liuchuting <liuchuting1@huawei.com>
Date: Mon, 6 Feb 2023 16:31:02 +0800
Subject: [PATCH] Profiler: report invalid parameters as warnings instead of
 errors when profiling is enabled by environment variables

---
 .../python/mindspore/profiler/envprofiling.py | 88 +++++++++++--------
 .../profiler/parser/minddata_analyzer.py      |  2 +-
 .../python/mindspore/profiler/profiling.py    | 86 +++++++++++-------
 3 files changed, 106 insertions(+), 70 deletions(-)

diff --git a/mindspore/python/mindspore/profiler/envprofiling.py b/mindspore/python/mindspore/profiler/envprofiling.py
index ef1fda805f0..25a1140dc52 100644
--- a/mindspore/python/mindspore/profiler/envprofiling.py
+++ b/mindspore/python/mindspore/profiler/envprofiling.py
@@ -16,23 +16,14 @@
 import json
 import os
 import time
-from enum import Enum
 from mindspore.profiler import Profiler
-from mindspore.profiler.profiling import AICORE_METRICS_DICT
+from mindspore.profiler.profiling import AICORE_METRICS_DICT, DeviceSupportParam
 from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
 from mindspore.profiler.parser.integrator import DeviceTarget
 
 from mindspore import log as logger, context
 
 
-class DeviceSupportParam(Enum):
-    """The device target enum."""
-    CPU = ['start', 'output_path']
-    GPU = ['start', 'output_path', 'data_process', 'timeline_limit', 'sync_enable']
-    ASCEND = ['start', 'output_path', 'data_process', 'timeline_limit', 'profile_memory', 'parallel_strategy',
-              'profile_communication', 'aicore_metrics', 'l2_cache']
-
-
 def get_profiling_options():
     """Get profiling options."""
     try:
@@ -42,27 +33,41 @@ def get_profiling_options():
     return options
 
 
-def parse_device_support_param(options):
+def parse_device_support_param(origin_options, final_options, factor_s_to_us=1e7):
     """Parse platform support parameters."""
     device_target = context.get_context("device_target").upper()
-    for param in options.keys():
-        if param not in DeviceSupportParam.__getattr__(f'{device_target}').value:
-            logger.warning(f"The parameter '{param}' is not supported on {device_target} currently.")
+    # Look the member up by name through the documented Enum[name] syntax.
+    support_list = DeviceSupportParam[device_target].value
+    support_dict = final_options.copy()
+    for param in set(origin_options) | set(final_options):
+        if param not in support_list and origin_options.get(param):
+            logger.warning(f"[Profiler]'{param}' is invalid params on this platform.")
+        if param not in support_list and final_options.get(param):
+            support_dict.pop(param)
+    support_dict.update({
+        "start_time": int(time.time() * factor_s_to_us),
+        "file_output_path": "",
+        "pid": os.getpid(),
+    })
+    return support_dict
 
 
 def construct_profiling_options():
     """Construct profiling options to determine which profiling data should be collected."""
     profiling_options = get_profiling_options()
     if profiling_options is None:
-        raise RuntimeError(
+        error_config = {"start": False}
+        if os.getenv("MS_PROFILER_RUN_CONFIG"):
+            return error_config
+        os.environ["MS_PROFILER_RUN_CONFIG"] = json.dumps(error_config)
+        logger.error(
             "The format of MS_PROFILER_OPTIONS is incorrect. "
             "The MS_PROFILER_OPTIONS parameter configuration may refer to "
             "'https://www.mindspore.cn/mindinsight/docs/zh-CN/master/performance_profiling_ascend.html'."
         )
-    options = combine_profile_options(profiling_options)
-    conbine_options = parse_profiling_args(options)
+        return error_config
+    conbine_options = combine_profile_options(profiling_options)
     if conbine_options.get("start"):
-        parse_device_support_param(profiling_options)
         output_path = conbine_options.get("output_path")
         if not output_path:
             output_path = os.path.join(os.getcwd(), "data")
@@ -87,11 +92,20 @@ def parse_pubilc_args(options):
             "The 'data_process' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
             f" but got type {type(options.get('data_process'))}, it will be set to true.")
         options["data_process"] = True
-    if not isinstance(options.get("timeline_limit"), int):
+    if not isinstance(options.get("op_time"), bool):
+        logger.warning(
+            "The 'op_time' parameter of the environment variable MS_PROFILE_OPTIONS must be bool,"
+            f" but got type {type(options.get('op_time'))}, it will be set to true.")
+        options["op_time"] = True
+    if isinstance(options.get("timeline_limit"), bool) or not isinstance(options.get("timeline_limit"), int):
         logger.warning(
             "The 'timeline_limit' parameter of the environment variable MS_PROFILE_OPTIONS must be int,"
             f" but got type {type(options.get('timeline_limit'))}, it will be set to 500.")
         options["timeline_limit"] = 500
+    if options.get('timeline_limit') <= 0:
+        logger.warning(
+            "The 'timeline_limit' parameter of the environment variable MS_PROFILE_OPTIONS must be greater than 0.")
+        options["timeline_limit"] = 500
     absolute_path = os.path.join(os.getcwd(), "data")
     if not isinstance(options.get("output_path"), str):
         logger.warning(
@@ -160,28 +174,25 @@ def parse_profiling_args(options):
 
 def combine_profile_options(profiling_options):
     """Combined profiling options."""
-    factor_s_to_us = 1e7
     output_path = os.path.join(os.getcwd(), "data")
-    if context.get_context("device_target").upper() == "GPU":
-        sync_enable = profiling_options.get("sync_enable", True)
-    else:
-        sync_enable = profiling_options.get("sync_enable", False)
-    options = {
+    config_options = {
         "start": profiling_options.get('start', False),
-        "start_time": int(time.time() * factor_s_to_us),
-        "pid": os.getpid(),
         "output_path": profiling_options.get('output_path', output_path),
-        "file_output_path": "",
         "profile_memory": profiling_options.get("profile_memory", False),
         "profile_communication": profiling_options.get("profile_communication", False),
         "aicore_metrics": profiling_options.get("aicore_metrics", 0),
         "l2_cache": profiling_options.get("l2_cache", False),
-        "sync_enable": sync_enable,
+        "sync_enable": profiling_options.get("sync_enable", True),
         "data_process": profiling_options.get("data_process", True),
         "timeline_limit": profiling_options.get("timeline_limit", 500),
         "parallel_strategy": profiling_options.get("parallel_strategy", True),
+        "op_time": profiling_options.get("op_time", True),
     }
-    return options
+    combine_options = parse_profiling_args(config_options)
+    if not combine_options.get("start"):
+        # Profiling is not started: return the validated options unfiltered.
+        return combine_options
+    return parse_device_support_param(profiling_options, combine_options)
 
 
 class EnvProfiler:
@@ -214,14 +225,15 @@ def profiler_check_env():
     if not config.get("start"):
         return
     Profiler(output_path=config.get("output_path"),
-             profile_memory=config.get("profile_memory"),
-             profile_communication=config.get("profile_communication"),
-             data_process=config.get("data_process"),
-             parallel_strategy=config.get("parallel_strategy"),
-             aicore_metrics=config.get("aicore_metrics"),
-             l2_cache=config.get("l2_cache"),
-             sync_enable=config.get("sync_enable"),
-             timeline_limit=config.get("timeline_limit"))
+             profile_memory=config.get("profile_memory", False),
+             profile_communication=config.get("profile_communication", False),
+             data_process=config.get("data_process", False),
+             parallel_strategy=config.get("parallel_strategy", False),
+             aicore_metrics=config.get("aicore_metrics", 0),
+             l2_cache=config.get("l2_cache", False),
+             sync_enable=config.get("sync_enable", False),
+             op_time=config.get("op_time", False),
+             timeline_limit=config.get("timeline_limit", 500))
 
 
 profiler_check_env()
diff --git a/mindspore/python/mindspore/profiler/parser/minddata_analyzer.py b/mindspore/python/mindspore/profiler/parser/minddata_analyzer.py
index 4150a3ce683..b775fe51c79 100644
--- a/mindspore/python/mindspore/profiler/parser/minddata_analyzer.py
+++ b/mindspore/python/mindspore/profiler/parser/minddata_analyzer.py
@@ -78,7 +78,7 @@ class MinddataProfilingAnalyzer:
         try:
             validated_dir = validate_and_normalize_path(dir_name)
         except RuntimeError as path_error:
-            logger.warning('<%s> <%s> is invalid.', dir_type, validated_dir)
+            logger.warning('<%s> is invalid.', dir_type)
             raise ProfilerPathErrorException(dir_type + 'is invalid.') from path_error
 
         if not os.path.isdir(validated_dir):
diff --git a/mindspore/python/mindspore/profiler/profiling.py b/mindspore/python/mindspore/profiler/profiling.py
index 073a276f94b..570f5ded627 100644
--- a/mindspore/python/mindspore/profiler/profiling.py
+++ b/mindspore/python/mindspore/profiler/profiling.py
@@ -18,6 +18,7 @@ import stat
 import time
 import json
 import glob
+from enum import Enum
 
 from mindspore import log as logger, context
 from mindspore.communication.management import GlobalComm, get_rank, get_group_size
@@ -62,6 +63,14 @@ AICORE_METRICS_DICT = {
 }
 
 
+class DeviceSupportParam(Enum):
+    """Names of the profiler parameters supported on each device target."""
+    CPU = ['start', 'output_path', 'timeline_limit']
+    GPU = ['start', 'output_path', 'data_process', 'timeline_limit', 'sync_enable', 'op_time']
+    ASCEND = ['start', 'output_path', 'data_process', 'timeline_limit', 'profile_memory', 'parallel_strategy',
+              'profile_communication', 'aicore_metrics', 'l2_cache', 'op_time']
+
+
 def _environment_check():
     if c_expression.security.enable_security():
         raise RuntimeError("Profiler is not supported when MindSpore is compiled with \'-s on\'.")
@@ -206,9 +215,9 @@ class Profiler:
             msg = "Do not init twice in the profiler."
             raise RuntimeError(msg)
         Profiler._has_initialized = True
-        self._parser_kwargs(kwargs)
         # get device_id and device_target
         self._get_devid_rankid_and_devtarget()
+        self._parser_kwargs(kwargs)
         self._get_output_path(kwargs)
         self._decide_device_target(kwargs)
         if self.start_profile:
@@ -328,7 +337,6 @@ class Profiler:
         if self._device_target and self._device_target != DeviceTarget.CPU.value and cpu_op_file:
             self._is_heterogeneous = True
         ProfilerInfo.set_analyse_start_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
-        self._init_profiler_info()
         if self._device_target and self._device_target == DeviceTarget.CPU.value:
             self._cpu_analyse()
 
@@ -338,6 +346,7 @@ class Profiler:
         elif self._device_target and self._device_target == DeviceTarget.ASCEND.value:
             self._ascend_analyse()
         logger.info("Profiling: all the data have been analyzed.")
+        self._init_profiler_info()
         ProfilerInfo.set_analyse_end_time(time.strftime("%Y-%m-%d %H:%M:%S", time.localtime()))
         ProfilerInfo.set_rank_size(self._rank_size)
         ProfilerInfo.set_heterogeneous(self._is_heterogeneous)
@@ -474,9 +483,10 @@ class Profiler:
         self._output_path = options.get('file_output_path')
         self._profile_memory = options.get('profile_memory')
         self._parallel_strategy = options.get('parallel_strategy')
-        self._timeline_size_limit_byte = options.get('timeline_limit')
+        self._timeline_size_limit_byte = options.get('timeline_limit') * 1024 * 1024
         self._data_process = options.get('data_process')
         self._profile_communication = options.get('profile_communication')
+        self._op_time = options.get('op_time')
         self._device_target = context.get_context("device_target").lower()
         self._profiler_manager = c_expression.ProfilerManager.get_instance()
         self._cpu_profiler = c_expression.Profiler.get_instance("CPU")
@@ -494,7 +504,8 @@ class Profiler:
         mode = "graph"
         if context.get_context("mode") == context.PYNATIVE_MODE:
             mode = "pynative"
-        ProfilerInfo.init_info(mode, self._rank_id)
+        store_id = self._dev_id if self._device_target == DeviceTarget.GPU.value else self._rank_id
+        ProfilerInfo.init_info(mode, store_id)
 
     def _decide_device_target(self, kwargs):
         """Complete Profiler initialization according to device_target"""
@@ -624,6 +635,7 @@ class Profiler:
         self._sync_enable = kwargs.pop("sync_enable", True)
         if not isinstance(self._sync_enable, bool):
             logger.warning("The parameter sync_enable is an invalid value, it will be set to True.")
+            self._sync_enable = True
 
     def _parse_parameter_for_ascend(self, kwargs):
         """Parse parameter in Proflier when the device target is Ascend."""
@@ -636,8 +648,9 @@ class Profiler:
 
         self._profile_communication = kwargs.pop("profile_communication", False)
         if not isinstance(self._profile_communication, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter profile_communication must be bool, "
-                            f"but got type {type(self._profile_communication)}")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter profile_communication must be bool, "
+                           f"but got type {type(self._profile_communication)}, it will be set to False.")
+            self._profile_communication = False
 
         if self._profile_communication:
             hccl_option = {"output": self._output_path, "task_trace": "on"}
@@ -648,21 +661,26 @@ class Profiler:
 
         self._profile_memory = kwargs.pop("profile_memory", False)
         if not isinstance(self._profile_memory, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter profile_memory must be bool, "
-                            f"but got type '{type(self._profile_memory)}'")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter profile_memory must be bool, "
+                           f"but got type {type(self._profile_memory)}, it will be set to False.")
+            self._profile_memory = False
 
         self._aicore_metrics_id = kwargs.pop("aicore_metrics", 0)
         if not isinstance(self._aicore_metrics_id, int):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
-                            f"but got type {type(self._aicore_metrics_id)}")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be int, "
+                           f"but got type {type(self._aicore_metrics_id)}, it will be set to 0.")
+            self._aicore_metrics_id = 0
+
         if self._aicore_metrics_id not in AICORE_METRICS_DICT:
-            raise ValueError(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
-                             f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter aicore_metrics must be in "
+                           f"[-1, 0, 1, 2, 3, 4, 5], but got {self._aicore_metrics_id}, it will be set to 0.")
+            self._aicore_metrics_id = 0
 
         l2_cache_enable = kwargs.pop("l2_cache", False)
         if not isinstance(l2_cache_enable, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
-                            f"but got type {type(l2_cache_enable)}")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter l2_cache must be bool, "
+                           f"but got type {type(l2_cache_enable)}, it will be set to False.")
+            l2_cache_enable = False
         if l2_cache_enable:
             self._l2_cache = "on"
         else:
@@ -670,15 +688,9 @@ class Profiler:
 
         self._parallel_strategy = kwargs.pop("parallel_strategy", True)
         if not isinstance(self._parallel_strategy, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter parallel_strategy must be bool, "
-                            f"but got type {type(self._parallel_strategy)}")
-
-        self._sync_enable = kwargs.pop("sync_enable", False)
-        if self._sync_enable:
-            logger.warning(f"The parameter sync_enable is not supported on Ascend currently.")
-
-        if kwargs:
-            logger.warning("%s are invalid params which don't work.", kwargs)
+            logger.warning(f"For '{self.__class__.__name__}', the parameter parallel_strategy must be bool, "
+                           f"but got type {type(self._parallel_strategy)}, it will be set to True.")
+            self._parallel_strategy = True
 
         task_sink = os.getenv("GRAPH_OP_RUN")
         if task_sink and task_sink == "1":
@@ -766,10 +778,10 @@ class Profiler:
     def _ascend_dynamic_net_analyse(self):
         """Analyse dynamic shape network info."""
         if self._profile_communication:
-            raise RuntimeError(
+            logger.warning(
                 "The profile_communication parameter cannot be set on the dynamic shape network.")
         if self._profile_memory:
-            raise RuntimeError("The profile_memory parameter cannot be set on the dynamic shape network.")
+            logger.warning("The profile_memory parameter cannot be set on the dynamic shape network.")
         logger.warning(
             "[Profiler]Dynamic Shape network does not support collecting step trace performance data currently.")
         dynamic_parser = DynamicFrameWorkParser(self._output_path, self._rank_id)
@@ -1337,20 +1349,32 @@ class Profiler:
         """Parse kwargs vale."""
         self._data_process = kwargs.pop("data_process", True)
         if not isinstance(self._data_process, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter data_process must be bool, "
-                            f"but got type {type(self._data_process)}")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter data_process must be bool, "
+                           f"but got type {type(self._data_process)}, it will be set to True.")
+            self._data_process = True
 
         self._op_time = kwargs.pop("op_time", True)
         if not isinstance(self._op_time, bool):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter op_time must be bool, "
-                            f"but got type {type(self._op_time)}")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter op_time must be bool, "
+                           f"but got type {type(self._op_time)}, it will be set to True.")
+            self._op_time = True
 
         timeline_limit = kwargs.pop("timeline_limit", 500)
         if not isinstance(timeline_limit, int):
-            raise TypeError(f"For '{self.__class__.__name__}', the parameter timeline_limit must be int, "
-                            f"but got type {type(timeline_limit)}")
+            logger.warning(f"For '{self.__class__.__name__}', the parameter timeline_limit must be int, "
+                           f"but got type {type(timeline_limit)}, it will be set to 500.")
+            timeline_limit = 500
+        if timeline_limit <= 0:
+            logger.warning(
+                "[Profiler]The 'timeline_limit' parameter must be greater than 0, it will be set to 500.")
+            timeline_limit = 500
         self._timeline_size_limit_byte = timeline_limit * 1024 * 1024
 
+        # Warn about truthy leftover kwargs the current device target does not support.
+        for param, value in kwargs.items():
+            if value and param not in DeviceSupportParam[f'{self._device_target}'.upper()].value:
+                logger.warning("%s are invalid param which don't work.", param)
+
     def _analyse_hccl_info(self):
         """Analyse hccl info."""
         hccl_path = os.path.join(self._output_path, "hccl_info_{}".format(self._rank_id))