forked from mindspore-Ecosystem/mindspore
!25470 Fix failure to get the profiling job id when there is more than one JOB dir in the profiler output path
Merge pull request !25470 from ougongchang/fix_job_id
This commit is contained in:
commit
b7af2b0cf7
|
@ -130,7 +130,7 @@ def get_file_join_name(input_path, file_name):
|
||||||
return file_join_name
|
return file_join_name
|
||||||
|
|
||||||
|
|
||||||
def get_file_names(input_path, file_name):
|
def get_file_path(input_path, file_name):
|
||||||
"""
|
"""
|
||||||
Search files under the special path.
|
Search files under the special path.
|
||||||
|
|
||||||
|
@ -139,20 +139,18 @@ def get_file_names(input_path, file_name):
|
||||||
file_name (str): The target of the filename, such as 'host_start_log'.
|
file_name (str): The target of the filename, such as 'host_start_log'.
|
||||||
|
|
||||||
Returns:
|
Returns:
|
||||||
list, file name list.
|
str, the path of the first matching file. If no matching file is found, None is returned.
|
||||||
"""
|
"""
|
||||||
|
|
||||||
input_path = os.path.realpath(input_path)
|
input_path = os.path.realpath(input_path)
|
||||||
name_list = []
|
|
||||||
if os.path.exists(input_path):
|
if os.path.exists(input_path):
|
||||||
files = os.listdir(input_path)
|
files = os.listdir(input_path)
|
||||||
for f in files:
|
for f in files:
|
||||||
if file_name in f and not f.endswith('.done') \
|
if file_name in f and not f.endswith('.done') \
|
||||||
and not f.endswith('.zip'):
|
and not f.endswith('.zip'):
|
||||||
name_list.append(f)
|
return os.path.join(input_path, f)
|
||||||
break
|
|
||||||
|
|
||||||
return name_list
|
return None
|
||||||
|
|
||||||
|
|
||||||
def parse_device_id(filename, device_id_list, profiler_file_prefix):
|
def parse_device_id(filename, device_id_list, profiler_file_prefix):
|
||||||
|
|
|
@ -25,7 +25,7 @@ import mindspore._c_expression as c_expression
|
||||||
from mindspore.dataset.core.config import _stop_dataset_profiler
|
from mindspore.dataset.core.config import _stop_dataset_profiler
|
||||||
from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
|
from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
|
||||||
ProfilerIOException, ProfilerException, ProfilerRawFileException
|
ProfilerIOException, ProfilerException, ProfilerRawFileException
|
||||||
from mindspore.profiler.common.util import get_file_names, fwrite_format
|
from mindspore.profiler.common.util import get_file_path, fwrite_format
|
||||||
from mindspore.profiler.common.validator.validate_path import \
|
from mindspore.profiler.common.validator.validate_path import \
|
||||||
validate_and_normalize_path
|
validate_and_normalize_path
|
||||||
from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
|
from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
|
||||||
|
@ -523,35 +523,44 @@ class Profiler:
|
||||||
"""
|
"""
|
||||||
|
|
||||||
job_id = ""
|
job_id = ""
|
||||||
|
job_dirs = filter(lambda item: item.startswith('JOB') and os.path.isdir(os.path.join(self._output_path, item)),
|
||||||
|
os.listdir(self._output_path))
|
||||||
|
sorted_job_dirs = sorted(job_dirs, key=lambda x: os.path.getmtime(os.path.join(self._output_path, x)),
|
||||||
|
reverse=True)
|
||||||
|
|
||||||
for item in os.listdir(self._output_path):
|
for dir_name in sorted_job_dirs:
|
||||||
if item.startswith('JOB'):
|
job_dir = os.path.join(self._output_path, dir_name)
|
||||||
path = os.path.join(self._output_path, item)
|
host_start_file_path = get_file_path(job_dir, "host_start.log")
|
||||||
|
if host_start_file_path is None:
|
||||||
|
logger.warning("Find profiling job path %s, but host_start.log not exist, "
|
||||||
|
"profiler will ignore this job dir.", job_dir)
|
||||||
|
continue
|
||||||
|
|
||||||
log_file = get_file_names(path, "host_start.log")
|
training_device_id = host_start_file_path.split('.')[-1]
|
||||||
if not log_file:
|
if self._dev_id != training_device_id:
|
||||||
logger.error("Profiling: job path %s, host_start.log not exist.", path)
|
logger.warning("Find profiling find job path %s, but not current training device id. "
|
||||||
continue
|
"Current training device id %s, but job path device id: %s, "
|
||||||
|
"profiler will ignore this job dir.", job_dir, self._dev_id, training_device_id)
|
||||||
|
continue
|
||||||
|
|
||||||
training_device_id = log_file[0].split('.')[-1]
|
job_start_time = self._parse_host_start_log(host_start_file_path)
|
||||||
if self._dev_id == training_device_id:
|
if not job_start_time:
|
||||||
log_file = os.path.join(path, log_file[0])
|
logger.warning("Find profiling job path %s, but fail to get job start info, "
|
||||||
job_start_time = self._parse_host_start_log(log_file)
|
"profiler will ignore this job dir.", job_start_time)
|
||||||
if not job_start_time:
|
continue
|
||||||
logger.error("Profiling: job path %s, fail to get job start info.", path)
|
|
||||||
break
|
if int(job_start_time) < self._start_time:
|
||||||
job_id = item
|
logger.warning("Find profiling job path %s, but start_time(%d) is earlier than this training "
|
||||||
if self._start_time > int(job_start_time):
|
"start_time(%d), profiler will ignore this job dir.",
|
||||||
logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
|
job_dir, job_start_time, self._start_time)
|
||||||
path, job_start_time, self._start_time)
|
continue
|
||||||
break
|
|
||||||
else:
|
job_id = dir_name
|
||||||
logger.info("Profiling: job path %s, dev id %s, training device id %s.",
|
break
|
||||||
path, training_device_id, self._dev_id)
|
|
||||||
|
|
||||||
if not job_id:
|
if not job_id:
|
||||||
msg = "Fail to get profiling job, output path is {}, " \
|
msg = "Fail to get profiling job, output path is {}, " \
|
||||||
"please check whether job dir in output path was generated, " \
|
"please check whether job dir(name startswith JOB) in output path was generated, " \
|
||||||
"or may be the device id from job dir dismatch the " \
|
"or may be the device id from job dir dismatch the " \
|
||||||
"device_id in current process.".format(self._output_path)
|
"device_id in current process.".format(self._output_path)
|
||||||
raise RuntimeError(msg)
|
raise RuntimeError(msg)
|
||||||
|
|
Loading…
Reference in New Issue