forked from mindspore-Ecosystem/mindspore
!3075 Add profiler module.
Merge pull request !3075 from yuximiao/yuximiao_mindspore_profiler
commit be2cfa9ed6
@ -216,6 +216,7 @@ install(
|
|||
${CMAKE_SOURCE_DIR}/mindspore/common
|
||||
${CMAKE_SOURCE_DIR}/mindspore/ops
|
||||
${CMAKE_SOURCE_DIR}/mindspore/communication
|
||||
${CMAKE_SOURCE_DIR}/mindspore/profiler
|
||||
DESTINATION ${INSTALL_PY_DIR}
|
||||
COMPONENT mindspore
|
||||
)
|
||||
|
|
|
@ -0,0 +1,27 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
Profiler Module Introduction.
|
||||
|
||||
This module provides Python APIs to enable the profiling of MindSpore neural networks.
|
||||
Users can import the mindspore.profiler.Profiler, initialize the Profiler object to start profiling,
|
||||
and use Profiler.analyse() to stop profiling and analyse the results.
|
||||
To visualize the profiling results, users can open the MindSpore web UI, find the corresponding run,
|
||||
and click the profile link.
|
||||
Currently, the Profiler supports AICore operator analysis.
|
||||
"""
|
||||
from mindspore.profiler.profiling import Profiler
|
||||
|
||||
__all__ = ["Profiler"]
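A minimal usage sketch of the workflow the docstring above describes. The bare `Profiler()` call and the surrounding context setup are assumptions, since the constructor's parameters live in `mindspore.profiler.profiling` and are not shown in this patch:

```python
# Hedged sketch: constructor arguments and context setup are assumptions.
from mindspore import context
from mindspore.profiler import Profiler

context.set_context(mode=context.GRAPH_MODE, device_target="Ascend")
profiler = Profiler()      # profiling starts when the object is initialized
# ... define and run the network here ...
profiler.analyse()         # stop profiling and analyse the collected results
```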
|
|
@ -0,0 +1,14 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
|
@ -0,0 +1,14 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
|
@ -0,0 +1,85 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Profiler error code and messages."""
|
||||
from enum import unique, Enum
|
||||
|
||||
|
||||
_GENERAL_MASK = 0b00001 << 7
|
||||
_PARSER_MASK = 0b00010 << 7
|
||||
_ANALYSER_MASK = 0b00011 << 7
|
||||
|
||||
|
||||
class ProfilerMgrErrors(Enum):
|
||||
"""Enum definition for profiler errors"""
|
||||
|
||||
@unique
|
||||
class ProfilerErrors(ProfilerMgrErrors):
|
||||
"""Profiler error codes."""
|
||||
# general error code
|
||||
PARAM_VALUE_ERROR = 0 | _GENERAL_MASK
|
||||
PATH_ERROR = 1 | _GENERAL_MASK
|
||||
PARAM_TYPE_ERROR = 2 | _GENERAL_MASK
|
||||
DIR_NOT_FOUND_ERROR = 3 | _GENERAL_MASK
|
||||
FILE_NOT_FOUND_ERROR = 4 | _GENERAL_MASK
|
||||
IO_ERROR = 5 | _GENERAL_MASK
|
||||
|
||||
# parser error code
|
||||
DEVICE_ID_MISMATCH_ERROR = 0 | _PARSER_MASK
|
||||
RAW_FILE_ERROR = 1 | _PARSER_MASK
|
||||
STEP_NUM_NOT_SUPPORTED_ERROR = 2 | _PARSER_MASK
|
||||
JOB_ID_MISMATCH_ERROR = 3 | _PARSER_MASK
|
||||
|
||||
# analyser error code
|
||||
COLUMN_NOT_EXIST_ERROR = 0 | _ANALYSER_MASK
|
||||
ANALYSER_NOT_EXIST_ERROR = 1 | _ANALYSER_MASK
|
||||
DEVICE_ID_ERROR = 2 | _ANALYSER_MASK
|
||||
OP_TYPE_ERROR = 3 | _ANALYSER_MASK
|
||||
GROUP_CONDITION_ERROR = 4 | _ANALYSER_MASK
|
||||
SORT_CONDITION_ERROR = 5 | _ANALYSER_MASK
|
||||
FILTER_CONDITION_ERROR = 6 | _ANALYSER_MASK
|
||||
COLUMN_NOT_SUPPORT_SORT_ERROR = 7 | _ANALYSER_MASK
|
||||
PIPELINE_OP_NOT_EXIST_ERROR = 8 | _ANALYSER_MASK
|
||||
|
||||
|
||||
|
||||
|
||||
@unique
|
||||
class ProfilerErrorMsg(Enum):
|
||||
"""Profiler error messages."""
|
||||
# general error msg
|
||||
PARAM_VALUE_ERROR = 'Param value error. {}'
|
||||
PATH_ERROR = 'Path error. {}'
|
||||
PARAM_TYPE_ERROR = 'Param type error. {}'
|
||||
DIR_NOT_FOUND_ERROR = 'The dir <{}> not found.'
|
||||
FILE_NOT_FOUND_ERROR = 'The file <{}> not found.'
|
||||
IO_ERROR = 'Read or write file fail.'
|
||||
|
||||
# parser error msg
|
||||
DEVICE_ID_MISMATCH_ERROR = 'The device ID mismatch.'
|
||||
RAW_FILE_ERROR = 'Raw file error. {}'
|
||||
STEP_NUM_NOT_SUPPORTED_ERROR = 'The step num must be in {}'
|
||||
JOB_ID_MISMATCH_ERROR = 'The job id in the parameter is not the same as ' \
|
||||
'in the training trace file. '
|
||||
|
||||
# analyser error msg
|
||||
COLUMN_NOT_EXIST_ERROR = 'The column {} does not exist.'
|
||||
ANALYSER_NOT_EXIST_ERROR = 'The analyser {} does not exist.'
|
||||
DEVICE_ID_ERROR = 'The device_id in search_condition error, {}'
|
||||
FILTER_CONDITION_ERROR = 'The filter_condition in search_condition error, {}'
|
||||
OP_TYPE_ERROR = 'The op_type in search_condition error, {}'
|
||||
GROUP_CONDITION_ERROR = 'The group_condition in search_condition error, {}'
|
||||
SORT_CONDITION_ERROR = 'The sort_condition in search_condition error, {}'
|
||||
COLUMN_NOT_SUPPORT_SORT_ERROR = 'The column {} does not support sorting.'
|
||||
PIPELINE_OP_NOT_EXIST_ERROR = 'The minddata pipeline operator {} does not exist.'
|
|
@ -0,0 +1,287 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Definition of error code and relative messages in profiler module."""
|
||||
from mindspore.profiler.common.exceptions.error_code import ProfilerErrors, \
|
||||
ProfilerErrorMsg
|
||||
|
||||
|
||||
class ProfilerException(Exception):
|
||||
"""
|
||||
Base class for Profiler exceptions.
|
||||
|
||||
Examples:
|
||||
>>> raise ProfilerException(ProfilerErrors.PATH_ERROR, 'path does not exist')
|
||||
"""
|
||||
|
||||
RUNTIME = 1
|
||||
TYPE = 1
|
||||
LEVEL = 0
|
||||
SYSID = 42
|
||||
|
||||
def __init__(self, error, message, http_code=500):
|
||||
"""
|
||||
Initialization of ProfilerException.
|
||||
|
||||
Args:
|
||||
error (Enum): Error value for specified case.
|
||||
message (str): Description for exception.
|
||||
http_code (int): HTTP code for the exception. Default: 500.
|
||||
"""
|
||||
if isinstance(message, str):
|
||||
message = ' '.join(message.split())
|
||||
super(ProfilerException, self).__init__(message)
|
||||
self.error = error
|
||||
self.message = message
|
||||
self.http_code = http_code
|
||||
|
||||
|
||||
@property
|
||||
def error_code(self):
|
||||
"""
|
||||
Transform the exception number into a Profiler error code.
|
||||
|
||||
Error code composition (4 bytes):
|
||||
runtime 2 bits, type 2 bits, level 3 bits, sysid 8 bits, modid 5 bits, value 12 bits.
|
||||
|
||||
num = ((0xFF & runtime) << 30) \
|
||||
| ((0xFF & type) << 28) \
|
||||
| ((0xFF & level) << 25) \
|
||||
| ((0xFF & sysid) << 17) \
|
||||
| ((0xFF & modid) << 12) \
|
||||
| (0x0FFF & value)
|
||||
|
||||
Returns:
|
||||
str, Hex string representing the composed Profiler error code.
|
||||
"""
|
||||
num = (((0xFF & self.RUNTIME) << 30)
|
||||
| ((0xFF & self.TYPE) << 28)
|
||||
| ((0xFF & self.LEVEL) << 25)
|
||||
| ((0xFF & self.SYSID) << 17)
|
||||
| ((0xFF & 6) << 12)  # modid field, hardcoded to 6
|
||||
| (0x0FFF & self.error.value))
|
||||
|
||||
return hex(num)[2:].zfill(8).upper()
|
||||
|
||||
def __str__(self):
|
||||
return '[{}] code: {}, msg: {}'.format(self.__class__.__name__, self.error_code, self.message)
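To make the bit layout above concrete, the following sketch recomputes the code for `ProfilerErrors.PARAM_VALUE_ERROR` (value `0 | _GENERAL_MASK` = 128) using the class constants `RUNTIME = 1`, `TYPE = 1`, `LEVEL = 0`, `SYSID = 42` and the hard-coded modid 6; the printed value is only illustrative:

```python
# Illustrative recomputation of error_code for PARAM_VALUE_ERROR (value = 0 | (0b00001 << 7) = 128).
runtime, err_type, level, sysid, modid, value = 1, 1, 0, 42, 6, 128
num = (((0xFF & runtime) << 30)
       | ((0xFF & err_type) << 28)
       | ((0xFF & level) << 25)
       | ((0xFF & sysid) << 17)
       | ((0xFF & modid) << 12)
       | (0x0FFF & value))
print(hex(num)[2:].zfill(8).upper())  # 50546080
```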
|
||||
|
||||
|
||||
class ProfilerParamValueErrorException(ProfilerException):
|
||||
"""The parameter value error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerParamValueErrorException, self).__init__(
|
||||
error=ProfilerErrors.PARAM_VALUE_ERROR,
|
||||
message=ProfilerErrorMsg.PARAM_VALUE_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerPathErrorException(ProfilerException):
|
||||
"""The path error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerPathErrorException, self).__init__(
|
||||
error=ProfilerErrors.PATH_ERROR,
|
||||
message=ProfilerErrorMsg.PATH_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerParamTypeErrorException(ProfilerException):
|
||||
"""The parameter type error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerParamTypeErrorException, self).__init__(
|
||||
error=ProfilerErrors.PARAM_TYPE_ERROR,
|
||||
message=ProfilerErrorMsg.PARAM_TYPE_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerDirNotFoundException(ProfilerException):
|
||||
"""The dir not found exception in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerDirNotFoundException, self).__init__(
|
||||
error=ProfilerErrors.DIR_NOT_FOUND_ERROR,
|
||||
message=ProfilerErrorMsg.DIR_NOT_FOUND_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerFileNotFoundException(ProfilerException):
|
||||
"""The file not found exception in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerFileNotFoundException, self).__init__(
|
||||
error=ProfilerErrors.FILE_NOT_FOUND_ERROR,
|
||||
message=ProfilerErrorMsg.FILE_NOT_FOUND_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerIOException(ProfilerException):
|
||||
"""The IO exception in profiler module."""
|
||||
|
||||
def __init__(self):
|
||||
super(ProfilerIOException, self).__init__(
|
||||
error=ProfilerErrors.IO_ERROR,
|
||||
message=ProfilerErrorMsg.IO_ERROR.value,
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerDeviceIdMismatchException(ProfilerException):
|
||||
"""The device id mismatch exception in profiler module."""
|
||||
|
||||
def __init__(self):
|
||||
super(ProfilerDeviceIdMismatchException, self).__init__(
|
||||
error=ProfilerErrors.DEVICE_ID_MISMATCH_ERROR,
|
||||
message=ProfilerErrorMsg.DEVICE_ID_MISMATCH_ERROR.value,
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerRawFileException(ProfilerException):
|
||||
"""The raw file exception in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerRawFileException, self).__init__(
|
||||
error=ProfilerErrors.RAW_FILE_ERROR,
|
||||
message=ProfilerErrorMsg.RAW_FILE_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerColumnNotExistException(ProfilerException):
|
||||
"""The column does not exist exception in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerColumnNotExistException, self).__init__(
|
||||
error=ProfilerErrors.COLUMN_NOT_EXIST_ERROR,
|
||||
message=ProfilerErrorMsg.COLUMN_NOT_EXIST_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerAnalyserNotExistException(ProfilerException):
|
||||
"""The analyser in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerAnalyserNotExistException, self).__init__(
|
||||
error=ProfilerErrors.ANALYSER_NOT_EXIST_ERROR,
|
||||
message=ProfilerErrorMsg.ANALYSER_NOT_EXIST_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerDeviceIdException(ProfilerException):
|
||||
"""The parameter device_id error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerDeviceIdException, self).__init__(
|
||||
error=ProfilerErrors.DEVICE_ID_ERROR,
|
||||
message=ProfilerErrorMsg.DEVICE_ID_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerOpTypeException(ProfilerException):
|
||||
"""The parameter op_type error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerOpTypeException, self).__init__(
|
||||
error=ProfilerErrors.OP_TYPE_ERROR,
|
||||
message=ProfilerErrorMsg.OP_TYPE_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerSortConditionException(ProfilerException):
|
||||
"""The parameter sort_condition error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerSortConditionException, self).__init__(
|
||||
error=ProfilerErrors.SORT_CONDITION_ERROR,
|
||||
message=ProfilerErrorMsg.SORT_CONDITION_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerFilterConditionException(ProfilerException):
|
||||
"""The parameter filer_condition error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerFilterConditionException, self).__init__(
|
||||
error=ProfilerErrors.FILTER_CONDITION_ERROR,
|
||||
message=ProfilerErrorMsg.FILTER_CONDITION_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerGroupConditionException(ProfilerException):
|
||||
"""The parameter group_condition error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerGroupConditionException, self).__init__(
|
||||
error=ProfilerErrors.GROUP_CONDITION_ERROR,
|
||||
message=ProfilerErrorMsg.GROUP_CONDITION_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerColumnNotSupportSortException(ProfilerException):
|
||||
"""The column does not support to sort error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerColumnNotSupportSortException, self).__init__(
|
||||
error=ProfilerErrors.COLUMN_NOT_SUPPORT_SORT_ERROR,
|
||||
message=ProfilerErrorMsg.COLUMN_NOT_SUPPORT_SORT_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class StepNumNotSupportedException(ProfilerException):
|
||||
"""The step number error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(StepNumNotSupportedException, self).__init__(
|
||||
error=ProfilerErrors.STEP_NUM_NOT_SUPPORTED_ERROR,
|
||||
message=ProfilerErrorMsg.STEP_NUM_NOT_SUPPORTED_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class JobIdMismatchException(ProfilerException):
|
||||
"""The Job ID mismatch error in profiler module."""
|
||||
|
||||
def __init__(self):
|
||||
super(JobIdMismatchException, self).__init__(
|
||||
error=ProfilerErrors.JOB_ID_MISMATCH_ERROR,
|
||||
message=ProfilerErrorMsg.JOB_ID_MISMATCH_ERROR.value,
|
||||
http_code=400
|
||||
)
|
||||
|
||||
|
||||
class ProfilerPipelineOpNotExistException(ProfilerException):
|
||||
"""The minddata pipeline operator does not exist error in profiler module."""
|
||||
|
||||
def __init__(self, msg):
|
||||
super(ProfilerPipelineOpNotExistException, self).__init__(
|
||||
error=ProfilerErrors.PIPELINE_OP_NOT_EXIST_ERROR,
|
||||
message=ProfilerErrorMsg.PIPELINE_OP_NOT_EXIST_ERROR.value.format(msg),
|
||||
http_code=400
|
||||
)
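A short sketch of how calling code might raise and handle one of these exceptions; the message text is made up:

```python
# Hedged sketch: exception handling on the caller side; the message is illustrative.
from mindspore.profiler.common.exceptions.exceptions import (
    ProfilerException, ProfilerParamValueErrorException)

try:
    raise ProfilerParamValueErrorException("step_num must be a positive integer")
except ProfilerException as exc:
    print(exc.http_code)   # 400
    print(exc.error_code)  # hex string composed from the bit fields described above
    print(exc)             # [ProfilerParamValueErrorException] code: ..., msg: Param value error. ...
```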
|
|
@ -0,0 +1,295 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
Profiler util.
|
||||
|
||||
This module provides utility functions for the profiler.
|
||||
"""
|
||||
import os
import re
|
||||
|
||||
|
||||
# One system count takes 10 ns, so 1 ms corresponds to 100000 system counts.
|
||||
|
||||
|
||||
PER_MS_SYSCNT = 100000
|
||||
|
||||
|
||||
def to_int(param, param_name):
|
||||
"""
|
||||
Transfer param to int type.
|
||||
|
||||
Args:
|
||||
param (Any): The parameter to be transformed.
|
||||
param_name (str): Param name.
|
||||
|
||||
Returns:
|
||||
int, the value after transformation.
|
||||
|
||||
"""
|
||||
try:
|
||||
param = int(param)
|
||||
except ValueError:
|
||||
raise TypeError('Must be Integer: ' + param_name)
|
||||
return param
|
||||
|
||||
|
||||
def fwrite_format(output_data_path, data_source=None, is_print=False, is_start=False):
|
||||
"""
|
||||
Write data to the output file.
|
||||
|
||||
Args:
|
||||
output_data_path (str): The output file path of the data.
|
||||
data_source (str, list, tuple): The data to write.
|
||||
is_print (bool): Whether to print the data to stdout.
|
||||
is_start (bool): Whether this is the first write to the output file; the old file is removed if True.
|
||||
"""
|
||||
|
||||
if is_start is True and os.path.exists(output_data_path):
|
||||
os.remove(output_data_path)
|
||||
|
||||
if isinstance(data_source, str) and data_source.startswith("title:"):
|
||||
title_label = '=' * 20
|
||||
data_source = title_label + data_source[6:] + title_label
|
||||
|
||||
with open(output_data_path, 'a+') as f:
|
||||
if isinstance(data_source, (list, tuple)):
|
||||
for raw_data in data_source:
|
||||
if isinstance(raw_data, (list, tuple)):
|
||||
raw_data = map(str, raw_data)
|
||||
raw_data = " ".join(raw_data)
|
||||
f.write(raw_data)
|
||||
f.write("\n")
|
||||
else:
|
||||
f.write(data_source)
|
||||
f.write("\n")
|
||||
|
||||
if is_print:
|
||||
if isinstance(data_source, (list, tuple)):
|
||||
for raw_data in data_source:
|
||||
if isinstance(raw_data, (list, tuple)):
|
||||
raw_data = map(str, raw_data)
|
||||
raw_data = " ".join(raw_data)
|
||||
print(raw_data)
|
||||
else:
|
||||
print(data_source)
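A small usage sketch of `fwrite_format`: a string starting with `title:` is rendered as a banner, and list or tuple rows are joined with spaces. The output path below is an assumption:

```python
# Hedged sketch: the output path is illustrative.
fwrite_format('/tmp/aicpu_profile.txt',
              data_source='title:DATA_PREPROCESS AICPU',
              is_start=True)                      # removes any previous file first
fwrite_format('/tmp/aicpu_profile.txt',
              data_source=[[1, 'GetNext', 0.123, 0.004, 100, 223]],
              is_print=True)                      # also echoes the rows to stdout
```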
|
||||
|
||||
|
||||
def get_log_slice_id(file_name):
|
||||
pattern = re.compile(r'(?<=slice_)\d+')
|
||||
slice_list = pattern.findall(file_name)
|
||||
index = re.findall(r'\d+', slice_list[0])
|
||||
return int(index[0])
|
||||
|
||||
|
||||
def get_file_join_name(input_path, file_name):
|
||||
"""
|
||||
Search for files under the specified path and join them into one file.
|
||||
|
||||
Args:
|
||||
input_path (str): The source path; files are searched under it.
|
||||
file_name (str): The target filename pattern, such as 'hwts.log.data.45.dev'.
|
||||
|
||||
Returns:
|
||||
str, the joined file name.
|
||||
"""
|
||||
name_list = []
|
||||
file_join_name = ''
|
||||
input_path = os.path.realpath(input_path)
|
||||
if os.path.exists(input_path):
|
||||
files = os.listdir(input_path)
|
||||
for f in files:
|
||||
if file_name in f and not f.endswith('.done') and not f.endswith('.join') \
|
||||
and not f.endswith('.zip'):
|
||||
name_list.append(f)
|
||||
|
||||
# Sort name_list by slice id.
|
||||
name_list.sort(key=get_log_slice_id)
|
||||
|
||||
if len(name_list) == 1:
|
||||
file_join_name = os.path.join(input_path, name_list[0])
|
||||
elif len(name_list) > 1:
|
||||
file_join_name = os.path.join(input_path, '%s.join' % file_name)
|
||||
if os.path.exists(file_join_name):
|
||||
os.remove(file_join_name)
|
||||
with open(file_join_name, 'ab') as bin_data:
|
||||
for i in name_list:
|
||||
file = input_path + os.sep + i
|
||||
with open(file, 'rb') as txt:
|
||||
bin_data.write(txt.read())
|
||||
return file_join_name
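A usage sketch: if the ada service produced sliced logs such as `hwts.log.data.45.dev.slice_0` and `hwts.log.data.45.dev.slice_1`, the helper concatenates them into a single `.join` file. The directory shown is an assumption:

```python
# Hedged sketch: the profiling data directory is illustrative.
joined = get_file_join_name('/var/log/npu/profiling/JOB123/data', 'hwts.log.data.45.dev')
if joined:
    print('joined file:', joined)  # e.g. .../hwts.log.data.45.dev.join
```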
|
||||
|
||||
def get_file_names(input_path, file_name):
|
||||
"""
|
||||
Search for files under the specified path.
|
||||
|
||||
Args:
|
||||
input_path (str): The source path; files are searched under it.
|
||||
file_name (str): The target filename pattern, such as 'host_start_log'.
|
||||
|
||||
Returns:
|
||||
list, file name list.
|
||||
"""
|
||||
|
||||
input_path = os.path.realpath(input_path)
|
||||
name_list = []
|
||||
if os.path.exists(input_path):
|
||||
files = os.listdir(input_path)
|
||||
for f in files:
|
||||
if file_name in f and not f.endswith('.done') \
|
||||
and not f.endswith('.zip'):
|
||||
name_list.append(f)
|
||||
break
|
||||
|
||||
return name_list
|
||||
|
||||
|
||||
def analyse_device_list_from_profiler_dir(profiler_dir):
|
||||
"""
|
||||
Analyse device list from profiler dir.
|
||||
|
||||
Args:
|
||||
profiler_dir (str): The profiler data dir.
|
||||
|
||||
Returns:
|
||||
list, the device_id list.
|
||||
"""
|
||||
profiler_file_prefix = ["timeline_display", "output_op_compute_time"]
|
||||
|
||||
device_id_list = set()
|
||||
for _, _, filenames in os.walk(profiler_dir):
|
||||
for filename in filenames:
|
||||
if filename.startswith("step_trace_raw"):
|
||||
items = filename.split("_")
|
||||
device_num = ""
|
||||
if len(items) > 3:
|
||||
device_num = items[3]
|
||||
else:
|
||||
items = filename.split("_")
|
||||
device_num = items[-1].split(".")[0] if items[-1].split(".") else ""
|
||||
|
||||
if device_num.isdigit() and '_'.join(items[:-1]) in profiler_file_prefix:
|
||||
device_id_list.add(device_num)
|
||||
|
||||
return sorted(list(device_id_list))
|
||||
|
||||
|
||||
def query_latest_trace_time_file(profiler_dir, device_id=0):
|
||||
"""
|
||||
Query the latest trace time file.
|
||||
|
||||
Args:
|
||||
profiler_dir (str): The profiler directory.
|
||||
device_id (int): The device id. Default: 0.
|
||||
|
||||
Returns:
|
||||
str, the latest trace time file path.
|
||||
"""
|
||||
files = os.listdir(profiler_dir)
|
||||
target_file = f'step_trace_raw_{device_id}_detail_time.csv'
|
||||
try:
|
||||
latest_file = max(
|
||||
filter(
|
||||
lambda file: file == target_file,
|
||||
files
|
||||
),
|
||||
key=lambda file: os.stat(os.path.join(profiler_dir, file)).st_mtime
|
||||
)
|
||||
except ValueError:
|
||||
return None
|
||||
return os.path.join(profiler_dir, latest_file)
|
||||
|
||||
|
||||
def query_step_trace_file(profiler_dir):
|
||||
"""
|
||||
Query for the step trace file.
|
||||
|
||||
Args:
|
||||
profiler_dir (str): The directory that contains all step trace files.
|
||||
|
||||
Returns:
|
||||
str, the file path of step trace time.
|
||||
"""
|
||||
files = os.listdir(profiler_dir)
|
||||
training_trace_file = list(
|
||||
filter(
|
||||
lambda file: file.startswith('training_trace') and not file.endswith('.done'),
|
||||
files
|
||||
)
|
||||
)
|
||||
if training_trace_file:
|
||||
return os.path.join(profiler_dir, training_trace_file[0])
|
||||
return None
|
||||
|
||||
|
||||
def get_summary_for_step_trace(average_info, header):
|
||||
"""The property of summary info."""
|
||||
if not average_info or not header:
|
||||
return {}
|
||||
total_time = get_field_value(average_info, 'total', header)
|
||||
iteration_interval = get_field_value(average_info, 'iteration_interval',
|
||||
header)
|
||||
fp_and_bp = get_field_value(average_info, 'fp_and_bp', header)
|
||||
tail = get_field_value(average_info, 'tail', header)
|
||||
summary = {
|
||||
'total_time': total_time,
|
||||
'iteration_interval': iteration_interval,
|
||||
'iteration_interval_percent': calculate_percent(iteration_interval, total_time),
|
||||
'fp_and_bp': fp_and_bp,
|
||||
'fp_and_bp_percent': calculate_percent(fp_and_bp, total_time),
|
||||
'tail': tail,
|
||||
'tail_percent': calculate_percent(tail, total_time)
|
||||
}
|
||||
return summary
|
||||
|
||||
|
||||
def calculate_percent(partial, total):
|
||||
"""Calculate percent value."""
|
||||
if total:
|
||||
percent = round(partial / total * 100, 2)
|
||||
else:
|
||||
percent = 0
|
||||
return f'{percent}%'
|
||||
|
||||
|
||||
def to_millisecond(sys_count, limit=4):
|
||||
"""Translate system count to millisecond."""
|
||||
return round(sys_count / PER_MS_SYSCNT, limit)
|
||||
|
||||
|
||||
def get_field_value(row_info, field_name, header, time_type='realtime'):
|
||||
"""
|
||||
Extract basic info through row_info.
|
||||
|
||||
Args:
|
||||
row_info (list): The list of data info in one row.
|
||||
field_name (str): The name in header.
|
||||
header (list[str]): The list of field names.
|
||||
time_type (str): The type of value, `realtime` or `systime`. Default: `realtime`.
|
||||
|
||||
Returns:
|
||||
int or float, the field value; converted to milliseconds when `time_type` is `realtime`.
|
||||
"""
|
||||
field_index = header.index(field_name)
|
||||
value = row_info[field_index]
|
||||
value = to_int(value, field_name)
|
||||
if time_type == 'realtime':
|
||||
value = to_millisecond(value)
|
||||
|
||||
return value
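Putting `get_summary_for_step_trace`, `get_field_value` and `to_millisecond` together, a sketch with made-up system-counter values (100000 counts per millisecond, per `PER_MS_SYSCNT`):

```python
# Hedged sketch: header names and counter values are illustrative.
header = ['total', 'iteration_interval', 'fp_and_bp', 'tail']
average_info = [1500000, 150000, 1200000, 150000]   # system-counter units
summary = get_summary_for_step_trace(average_info, header)
print(summary['total_time'])                  # 15.0 (ms)
print(summary['iteration_interval_percent'])  # 10.0%
print(summary['fp_and_bp_percent'])           # 80.0%
```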
|
||||
|
||||
def get_options(options):
|
||||
if options is None:
|
||||
options = {}
|
||||
return options
|
|
@ -0,0 +1,14 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
|
@ -0,0 +1,26 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Profiler check parameters."""
|
||||
def check_bool(input_param, param_name):
|
||||
"""Bool type judgment."""
|
||||
if isinstance(input_param, bool):
|
||||
return input_param
|
||||
raise TypeError("Parameter {}: input type must be bool!".format(param_name))
|
||||
|
||||
def check_subgraph(subgraph):
|
||||
"""Check subgraph."""
|
||||
if subgraph in ("all", "Default", "Gradients"):
|
||||
return subgraph
|
||||
raise ValueError("subgraph must be all or Default or Gradients, but got {}.".format(subgraph))
|
|
@ -0,0 +1,307 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Validate the profiler parameters."""
|
||||
import os
|
||||
import sys
|
||||
|
||||
|
||||
from mindspore.profiler.common.exceptions.exceptions import ProfilerParamTypeErrorException, \
|
||||
ProfilerDeviceIdException, ProfilerOpTypeException, \
|
||||
ProfilerSortConditionException, ProfilerFilterConditionException, \
|
||||
ProfilerGroupConditionException, ProfilerParamValueErrorException
|
||||
from mindspore import log
|
||||
from mindspore.profiler.common.util import to_int
|
||||
|
||||
AICORE_TYPE_COL = ["op_type", "execution_time", "execution_frequency", "precent"]
|
||||
AICORE_DETAIL_COL = ["op_name", "op_type", "avg_execution_time", "subgraph", "full_op_name"]
|
||||
AICPU_COL = ["serial_number", "op_type", "total_time", "dispatch_time", "run_start",
|
||||
"run_end"]
|
||||
MINDDATA_PIPELINE_COL = [
|
||||
'op_id', 'op_type', 'num_workers', 'output_queue_average_size',
|
||||
'output_queue_length', 'output_queue_usage_rate', 'sample_interval',
|
||||
'parent_id'
|
||||
]
|
||||
|
||||
|
||||
def validate_condition(search_condition):
|
||||
"""
|
||||
Verify whether the parameters in search_condition are valid.
|
||||
|
||||
Args:
|
||||
search_condition (dict): The search condition.
|
||||
|
||||
Raises:
|
||||
ProfilerParamTypeErrorException: If the type of the param in search_condition is invalid.
|
||||
ProfilerDeviceIdException: If the device_id param in search_condition is invalid.
|
||||
ProfilerOpTypeException: If the op_type param in search_condition is invalid.
|
||||
ProfilerGroupConditionException: If the group_condition param in search_condition is invalid.
|
||||
ProfilerSortConditionException: If the sort_condition param in search_condition is invalid.
|
||||
ProfilerFilterConditionException: If the filter_condition param in search_condition is invalid.
|
||||
"""
|
||||
if not isinstance(search_condition, dict):
|
||||
log.error("Invalid search_condition type, it should be dict.")
|
||||
raise ProfilerParamTypeErrorException(
|
||||
"Invalid search_condition type, it should be dict.")
|
||||
|
||||
if "device_id" in search_condition:
|
||||
device_id = search_condition.get("device_id")
|
||||
if not isinstance(device_id, str):
|
||||
raise ProfilerDeviceIdException("Invalid device_id type, it should be str.")
|
||||
|
||||
if "op_type" in search_condition:
|
||||
op_type = search_condition.get("op_type")
|
||||
if op_type == "aicpu":
|
||||
search_scope = AICPU_COL
|
||||
elif op_type == "aicore_type":
|
||||
search_scope = AICORE_TYPE_COL
|
||||
elif op_type == "aicore_detail":
|
||||
search_scope = AICORE_DETAIL_COL
|
||||
else:
|
||||
raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']")
|
||||
else:
|
||||
raise ProfilerOpTypeException("The op_type must in ['aicpu', 'aicore_type', 'aicore_detail']")
|
||||
|
||||
if "group_condition" in search_condition:
|
||||
validate_group_condition(search_condition)
|
||||
|
||||
if "sort_condition" in search_condition:
|
||||
validate_sort_condition(search_condition, search_scope)
|
||||
|
||||
if "filter_condition" in search_condition:
|
||||
validate_filter_condition(search_condition)
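A sketch of a `search_condition` that passes `validate_condition`; the operator names and limits are illustrative:

```python
# Hedged sketch: all field values are illustrative.
search_condition = {
    "device_id": "0",
    "op_type": "aicore_type",
    "group_condition": {"limit": 10, "offset": 0},
    "sort_condition": {"name": "execution_time", "type": "descending"},
    "filter_condition": {"op_type": {"in": ["MatMul", "Conv2D"]}},
}
validate_condition(search_condition)  # raises a Profiler*Exception on invalid input
```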
|
||||
|
||||
|
||||
def validate_group_condition(search_condition):
|
||||
"""
|
||||
Verify whether the group_condition in search_condition is valid.
|
||||
|
||||
Args:
|
||||
search_condition (dict): The search condition.
|
||||
|
||||
Raises:
|
||||
ProfilerGroupConditionException: If the group_condition param in search_condition is invalid.
|
||||
"""
|
||||
group_condition = search_condition.get("group_condition")
|
||||
if not isinstance(group_condition, dict):
|
||||
raise ProfilerGroupConditionException("The group condition must be dict.")
|
||||
if "limit" in group_condition:
|
||||
limit = group_condition.get("limit", 10)
|
||||
if isinstance(limit, bool) \
|
||||
or not isinstance(group_condition.get("limit"), int):
|
||||
log.error("The limit must be int.")
|
||||
raise ProfilerGroupConditionException("The limit must be int.")
|
||||
if limit < 1 or limit > 100:
|
||||
raise ProfilerGroupConditionException("The limit must in [1, 100].")
|
||||
|
||||
if "offset" in group_condition:
|
||||
offset = group_condition.get("offset", 0)
|
||||
if isinstance(offset, bool) \
|
||||
or not isinstance(group_condition.get("offset"), int):
|
||||
log.error("The offset must be int.")
|
||||
raise ProfilerGroupConditionException("The offset must be int.")
|
||||
if offset < 0:
|
||||
raise ProfilerGroupConditionException("The offset must ge 0.")
|
||||
|
||||
if offset > 1000000:
|
||||
raise ProfilerGroupConditionException("The offset must le 1000000.")
|
||||
|
||||
|
||||
def validate_sort_condition(search_condition, search_scope):
|
||||
"""
|
||||
Verify whether the sort_condition in search_condition is valid.
|
||||
|
||||
Args:
|
||||
search_condition (dict): The search condition.
|
||||
search_scope (list): The search scope.
|
||||
|
||||
Raises:
|
||||
ProfilerSortConditionException: If the sort_condition param in search_condition is invalid.
|
||||
"""
|
||||
sort_condition = search_condition.get("sort_condition")
|
||||
if not isinstance(sort_condition, dict):
|
||||
raise ProfilerSortConditionException("The sort condition must be dict.")
|
||||
if "name" in sort_condition:
|
||||
sorted_name = sort_condition.get("name", "")
|
||||
err_msg = "The sorted_name must be in {}".format(search_scope)
|
||||
if not isinstance(sorted_name, str):
|
||||
log.error("Wrong sorted name type.")
|
||||
raise ProfilerSortConditionException("Wrong sorted name type.")
|
||||
if sorted_name not in search_scope:
|
||||
log.error(err_msg)
|
||||
raise ProfilerSortConditionException(err_msg)
|
||||
|
||||
if "type" in sort_condition:
|
||||
sorted_type_param = ['ascending', 'descending']
|
||||
sorted_type = sort_condition.get("type")
|
||||
if sorted_type and sorted_type not in sorted_type_param:
|
||||
err_msg = "The sorted type must be ascending or descending."
|
||||
log.error(err_msg)
|
||||
raise ProfilerSortConditionException(err_msg)
|
||||
|
||||
|
||||
def validate_op_filter_condition(op_condition, value_type=str, value_type_msg='str'):
|
||||
"""
|
||||
Verify whether the op_condition in filter_condition is valid.
|
||||
|
||||
Args:
|
||||
op_condition (dict): The op_condition in search_condition.
|
||||
value_type (type): The value type. Default: str.
|
||||
value_type_msg (str): The value type message. Default: 'str'.
|
||||
|
||||
Raises:
|
||||
ProfilerFilterConditionException: If the filter_condition param in search_condition is invalid.
|
||||
"""
|
||||
filter_key = ["in", "not_in", "partial_match_str_in"]
|
||||
if not isinstance(op_condition, dict):
|
||||
raise ProfilerFilterConditionException("The filter condition value must be dict.")
|
||||
for key, value in op_condition.items():
|
||||
if not isinstance(key, str):
|
||||
raise ProfilerFilterConditionException("The filter key must be str")
|
||||
if not isinstance(value, list):
|
||||
raise ProfilerFilterConditionException("The filter value must be list")
|
||||
if key not in filter_key:
|
||||
raise ProfilerFilterConditionException("The filter key must in {}.".format(filter_key))
|
||||
for item in value:
|
||||
if not isinstance(item, value_type):
|
||||
raise ProfilerFilterConditionException(
|
||||
"The item in filter value must be {}.".format(value_type_msg)
|
||||
)
|
||||
|
||||
|
||||
def validate_filter_condition(search_condition):
|
||||
"""
|
||||
Verify whether the filter_condition in search_condition is valid.
|
||||
|
||||
Args:
|
||||
search_condition (dict): The search condition.
|
||||
|
||||
Raises:
|
||||
ProfilerFilterConditionException: If the filter_condition param in search_condition is invalid.
|
||||
"""
|
||||
filter_condition = search_condition.get("filter_condition")
|
||||
if not isinstance(filter_condition, dict):
|
||||
raise ProfilerFilterConditionException("The filter condition must be dict.")
|
||||
if filter_condition:
|
||||
if "op_type" in filter_condition:
|
||||
op_type_condition = filter_condition.get("op_type")
|
||||
validate_op_filter_condition(op_type_condition)
|
||||
if "op_name" in filter_condition:
|
||||
op_name_condition = filter_condition.get("op_name")
|
||||
validate_op_filter_condition(op_name_condition)
|
||||
if "op_type" not in filter_condition and "op_name" not in filter_condition:
|
||||
raise ProfilerFilterConditionException("The key of filter_condition is not support")
|
||||
|
||||
|
||||
def validate_and_set_job_id_env(job_id_env):
|
||||
"""
|
||||
Validate the job id and set it in environment.
|
||||
|
||||
Args:
|
||||
job_id_env (str): The id to be set in the environment variable `JOB_ID`.
|
||||
|
||||
Returns:
|
||||
int, the valid job id env.
|
||||
"""
|
||||
if job_id_env is None:
|
||||
return job_id_env
|
||||
# get job_id_env in int type
|
||||
valid_id = to_int(job_id_env, 'job_id_env')
|
||||
# check the range of valid_id
|
||||
if valid_id and 255 < valid_id < sys.maxsize:
|
||||
os.environ['JOB_ID'] = job_id_env
|
||||
else:
|
||||
log.warning("Invalid job_id_env %s. The value should be int and between 255 and %s. Use"
|
||||
"default job id env instead.",
|
||||
job_id_env, sys.maxsize)
|
||||
return valid_id
|
||||
|
||||
|
||||
def validate_ui_proc(proc_name):
|
||||
"""
|
||||
Validate proc name in restful request.
|
||||
|
||||
Args:
|
||||
proc_name (str): The proc name to query. Acceptable value is in
|
||||
[`iteration_interval`, `fp_and_bp`, `tail`].
|
||||
|
||||
Raises:
|
||||
ProfilerParamValueErrorException: If the proc_name is invalid.
|
||||
"""
|
||||
accept_names = ['iteration_interval', 'fp_and_bp', 'tail']
|
||||
if proc_name not in accept_names:
|
||||
log.error("Invalid proc_name. The proc_name for restful api is in %s", accept_names)
|
||||
raise ProfilerParamValueErrorException(f'proc_name should be in {accept_names}.')
|
||||
|
||||
|
||||
def validate_minddata_pipeline_condition(condition):
|
||||
"""
|
||||
Verify whether the minddata pipeline search condition is valid.
|
||||
|
||||
Args:
|
||||
condition (dict): The minddata pipeline search condition.
|
||||
|
||||
Raises:
|
||||
ProfilerParamTypeErrorException: If the type of the search condition is
|
||||
invalid.
|
||||
ProfilerDeviceIdException: If the device_id param in the search
|
||||
condition is invalid.
|
||||
ProfilerGroupConditionException: If the group_condition param in the
|
||||
search condition is invalid.
|
||||
ProfilerSortConditionException: If the sort_condition param in the
|
||||
search condition is invalid.
|
||||
ProfilerFilterConditionException: If the filter_condition param in the
|
||||
search condition is invalid.
|
||||
"""
|
||||
if not isinstance(condition, dict):
|
||||
log.error("Invalid condition type, it should be dict.")
|
||||
raise ProfilerParamTypeErrorException(
|
||||
"Invalid condition type, it should be dict."
|
||||
)
|
||||
|
||||
if "device_id" in condition:
|
||||
device_id = condition.get("device_id")
|
||||
if not isinstance(device_id, str):
|
||||
raise ProfilerDeviceIdException(
|
||||
"Invalid device_id type, it should be str."
|
||||
)
|
||||
|
||||
if "group_condition" in condition:
|
||||
validate_group_condition(condition)
|
||||
|
||||
if "sort_condition" in condition:
|
||||
validate_sort_condition(condition, MINDDATA_PIPELINE_COL)
|
||||
|
||||
if "filter_condition" in condition:
|
||||
filter_condition = condition.get('filter_condition')
|
||||
if not isinstance(filter_condition, dict):
|
||||
raise ProfilerFilterConditionException(
|
||||
"The filter condition must be dict."
|
||||
)
|
||||
for key, value in filter_condition.items():
|
||||
if key == 'op_id':
|
||||
validate_op_filter_condition(
|
||||
value, value_type=int, value_type_msg='int'
|
||||
)
|
||||
elif key == 'op_type':
|
||||
validate_op_filter_condition(value)
|
||||
elif key == 'is_display_op_detail':
|
||||
if not isinstance(value, bool):
|
||||
raise ProfilerFilterConditionException(
|
||||
"The condition must be bool."
|
||||
)
|
||||
else:
|
||||
raise ProfilerFilterConditionException(
|
||||
"The key {} of filter_condition is not support.".format(key)
|
||||
)
|
|
@ -0,0 +1,60 @@
|
|||
# Copyright 2019 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Validate the input path."""
|
||||
import os
|
||||
|
||||
|
||||
def validate_and_normalize_path(
|
||||
path,
|
||||
check_absolute_path=False,
|
||||
allow_parent_dir=False,
|
||||
):
|
||||
"""
|
||||
Validates path and returns its normalized form.
|
||||
|
||||
If the path has a valid scheme, it is treated as a URL; otherwise it is
|
||||
treated as a Unix local path.
|
||||
|
||||
Note:
|
||||
File scheme (rfc8089) is currently not supported.
|
||||
|
||||
Args:
|
||||
path (str): Path to be normalized.
|
||||
check_absolute_path (bool): Whether to check that the path is an absolute path.
|
||||
allow_parent_dir (bool): Whether to allow parent directory references ('..') in the path.
|
||||
|
||||
Returns:
|
||||
str, normalized path.
|
||||
"""
|
||||
if not path:
|
||||
raise RuntimeError("The path is invalid!")
|
||||
|
||||
path_str = str(path)
|
||||
if not allow_parent_dir:
|
||||
path_components = path_str.split("/")
|
||||
if ".." in path_components:
|
||||
raise RuntimeError("The path is invalid!")
|
||||
|
||||
# The path does not have a valid scheme; treat it as a Unix local path.
|
||||
if check_absolute_path:
|
||||
if not path_str.startswith("/"):
|
||||
raise RuntimeError("The path is invalid!")
|
||||
try:
|
||||
# os.path.realpath may raise ValueError for malformed paths (e.g. embedded null bytes).
|
||||
normalized_path = os.path.realpath(path)
|
||||
except ValueError:
|
||||
raise RuntimeError("The path is invalid!")
|
||||
|
||||
return normalized_path
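Usage sketch; the concrete paths are assumptions, and the resolved form of the first call depends on the host file system:

```python
# Hedged sketch: paths are illustrative.
print(validate_and_normalize_path("/tmp/profiler_data/./output"))        # "/tmp/profiler_data/output" (system-dependent)
validate_and_normalize_path("../etc/passwd")                             # raises RuntimeError: parent dir not allowed
validate_and_normalize_path("relative/path", check_absolute_path=True)   # raises RuntimeError: not an absolute path
```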
|
|
@ -0,0 +1,14 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
|
@ -0,0 +1,175 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
The parser for AI CPU preprocess data.
|
||||
"""
|
||||
import os
|
||||
|
||||
from mindspore.profiler.common.util import fwrite_format, get_file_join_name
|
||||
from mindspore import log as logger
|
||||
|
||||
|
||||
class DataPreProcessParser:
|
||||
"""
|
||||
The Parser for AI CPU preprocess data.
|
||||
|
||||
Args:
|
||||
input_path (str): The profiling job path.
|
||||
output_filename (str): The output data path and name.
|
||||
|
||||
"""
|
||||
|
||||
_source_file_target = 'DATA_PREPROCESS.dev.AICPU.'
|
||||
_dst_file_title = 'title:DATA_PREPROCESS AICPU'
|
||||
_dst_file_column_title = ['serial_number', 'node_type_name', 'total_time(ms)',
|
||||
'dispatch_time(ms)', 'run_start', 'run_end']
|
||||
_ms_unit = 1000
|
||||
|
||||
def __init__(self, input_path, output_filename):
|
||||
self._input_path = input_path
|
||||
self._output_filename = output_filename
|
||||
self._source_file_name = self._get_source_file()
|
||||
self._ms_kernel_flag = 3
|
||||
self._other_kernel_flag = 6
|
||||
self._thread_flag = 7
|
||||
self._ms_kernel_run_end_index = 2
|
||||
self._other_kernel_run_end_index = 5
|
||||
self._result_list = []
|
||||
self._min_cycle_counter = float('inf')
|
||||
|
||||
def _get_source_file(self):
|
||||
"""Get log file name, which was created by ada service."""
|
||||
file_name = get_file_join_name(self._input_path, self._source_file_target)
|
||||
if not file_name:
|
||||
data_path = os.path.join(self._input_path, "data")
|
||||
file_name = get_file_join_name(data_path, self._source_file_target)
|
||||
return file_name
|
||||
|
||||
def _get_kernel_result(self, number, node_list, thread_list):
|
||||
"""Get the profiling data form different aicpu kernel"""
|
||||
try:
|
||||
if len(node_list) == self._ms_kernel_flag and len(thread_list) == self._thread_flag:
|
||||
node_type_name = node_list[0].split(':')[-1]
|
||||
run_end_index = self._ms_kernel_run_end_index
|
||||
elif len(node_list) == self._other_kernel_flag and len(thread_list) == self._thread_flag:
|
||||
node_type_name = node_list[0].split(':')[-1].split('/')[-1].split('-')[0]
|
||||
run_end_index = self._other_kernel_run_end_index
|
||||
else:
|
||||
logger.warning("the data format can't support 'node_list':%s", str(node_list))
|
||||
return None
|
||||
|
||||
run_start = node_list[1].split(':')[-1].split(' ')[0]
|
||||
run_end = node_list[run_end_index].split(':')[-1].split(' ')[0]
|
||||
total_time = float(thread_list[-1].split('=')[-1].split()[0]) / self._ms_unit
|
||||
dispatch_time = float(thread_list[-2].split('=')[-1].split()[0]) / self._ms_unit
|
||||
|
||||
return [number, node_type_name, total_time, dispatch_time,
|
||||
run_start, run_end]
|
||||
except IndexError as e:
|
||||
logger.error(e)
|
||||
return None
|
||||
|
||||
def execute(self):
|
||||
"""Execute the parser, get result data, and write it to the output file."""
|
||||
|
||||
if not os.path.exists(self._source_file_name):
|
||||
logger.info("Did not find the aicpu profiling source file")
|
||||
return
|
||||
|
||||
with open(self._source_file_name, 'rb') as ai_cpu_data:
|
||||
ai_cpu_str = str(ai_cpu_data.read().replace(b'\n\x00', b' ___ ')
|
||||
.replace(b'\x00', b' ___ '))[2:-1]
|
||||
ai_cpu_lines = ai_cpu_str.split(" ___ ")
|
||||
|
||||
result_list = list()
|
||||
ai_cpu_total_time_summary = 0
|
||||
# Node serial number.
|
||||
serial_number = 1
|
||||
for i in range(len(ai_cpu_lines) - 1):
|
||||
node_line = ai_cpu_lines[i]
|
||||
thread_line = ai_cpu_lines[i + 1]
|
||||
if "Node" in node_line and "Thread" in thread_line:
|
||||
# Get the node data from node_line
|
||||
node_list = node_line.split(',')
|
||||
thread_list = thread_line.split(',')
|
||||
result = self._get_kernel_result(serial_number, node_list, thread_list)
|
||||
|
||||
if result is None:
|
||||
continue
|
||||
|
||||
result_list.append(result)
|
||||
# Calculate the total time.
|
||||
total_time = result[2]
|
||||
ai_cpu_total_time_summary += total_time
|
||||
# Increase node serial number.
|
||||
serial_number += 1
|
||||
elif "Node" in node_line and "Thread" not in thread_line:
|
||||
node_type_name = node_line.split(',')[0].split(':')[-1]
|
||||
logger.warning("The node type:%s cannot find thread data", node_type_name)
|
||||
|
||||
if result_list:
|
||||
ai_cpu_total_time = format(ai_cpu_total_time_summary, '.6f')
|
||||
result_list.append(["AI CPU Total Time(ms):", ai_cpu_total_time])
|
||||
fwrite_format(self._output_filename, " ".join(self._dst_file_column_title), is_start=True, is_print=True)
|
||||
fwrite_format(self._output_filename, result_list, is_print=True)
|
||||
|
||||
# For timeline display.
|
||||
self._result_list = result_list
|
||||
|
||||
def query_aicpu_data(self):
|
||||
"""
|
||||
Get execution time of AI CPU operator.
|
||||
|
||||
Returns:
|
||||
dict, the metadata of AI CPU operator execution time.
|
||||
"""
|
||||
stream_id = 0 # Default stream id for AI CPU.
|
||||
pid = 9000 # Default pid for AI CPU.
|
||||
factor = 1000 # Convert time unit from 1us to 1ms
|
||||
total_time = 0
|
||||
min_cycle_counter = float('inf')
|
||||
aicpu_info = []
|
||||
op_count_list = []
|
||||
for aicpu_item in self._result_list:
|
||||
if "AI CPU Total Time(ms):" in aicpu_item:
|
||||
total_time = aicpu_item[-1]
|
||||
continue
|
||||
|
||||
op_name = aicpu_item[1]
|
||||
start_time = float(aicpu_item[4]) / factor
|
||||
min_cycle_counter = min(min_cycle_counter, start_time)
|
||||
end_time = float(aicpu_item[5]) / factor
|
||||
duration = end_time - start_time
|
||||
aicpu_info.append([op_name, stream_id, start_time, duration, pid])
|
||||
|
||||
# Record the number of operator types.
|
||||
if op_name not in op_count_list:
|
||||
op_count_list.append(op_name)
|
||||
|
||||
self._min_cycle_counter = min_cycle_counter
|
||||
aicpu_dict = {
|
||||
'info': aicpu_info,
|
||||
'total_time': float(total_time),
|
||||
'op_exe_times': len(aicpu_info),
|
||||
'num_of_ops': len(op_count_list),
|
||||
'num_of_streams': 1
|
||||
}
|
||||
|
||||
return aicpu_dict
|
||||
|
||||
@property
|
||||
def min_cycle_counter(self):
|
||||
"""Get minimum cycle counter in AI CPU."""
|
||||
return self._min_cycle_counter
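A sketch of how the parser might be driven; the job directory and output file are assumptions, and `execute()` silently returns if no `DATA_PREPROCESS.dev.AICPU.*` source file is found:

```python
# Hedged sketch: input and output paths are illustrative.
parser = DataPreProcessParser('/var/log/npu/profiling/JOB123',
                              '/tmp/aicpu_intermediate_0.txt')
parser.execute()                          # parse the raw log and write the formatted result
aicpu = parser.query_aicpu_data()
print(aicpu['op_exe_times'], aicpu['total_time'])
```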
|
|
@ -0,0 +1,113 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""The container of metadata used in profiler parser."""
|
||||
|
||||
|
||||
class HWTSContainer:
|
||||
"""
|
||||
HWTS output container.
|
||||
|
||||
Args:
|
||||
split_list (list): The split list of metadata in HWTS output file.
|
||||
"""
|
||||
def __init__(self, split_list):
|
||||
self._op_name = ''
|
||||
self._duration = None
|
||||
self._status = split_list[0]
|
||||
self._task_id = split_list[6]
|
||||
self._cycle_counter = float(split_list[7])
|
||||
self._stream_id = split_list[8]
|
||||
|
||||
@property
|
||||
def status(self):
|
||||
"""Get the status of the operator, i.e. Start or End."""
|
||||
return self._status
|
||||
|
||||
@property
|
||||
def task_id(self):
|
||||
"""Get the task id of the operator."""
|
||||
return self._task_id
|
||||
|
||||
@property
|
||||
def cycle_counter(self):
|
||||
"""Get the cycle counter."""
|
||||
return self._cycle_counter
|
||||
|
||||
@property
|
||||
def stream_id(self):
|
||||
"""Get the stream id of the operator."""
|
||||
return self._stream_id
|
||||
|
||||
@property
|
||||
def op_name(self):
|
||||
"""Get the name of the operator."""
|
||||
return self._op_name
|
||||
|
||||
@op_name.setter
|
||||
def op_name(self, name):
|
||||
"""Set the name of the operator."""
|
||||
self._op_name = name
|
||||
|
||||
@property
|
||||
def duration(self):
|
||||
"""Get the duration of the operator execution."""
|
||||
return self._duration
|
||||
|
||||
@duration.setter
|
||||
def duration(self, value):
|
||||
"""Set the duration of the operator execution."""
|
||||
self._duration = value
|
||||
|
||||
|
||||
class TimelineContainer:
|
||||
"""
|
||||
A container of operator computation metadata.
|
||||
|
||||
Args:
|
||||
split_list (list): The split list of metadata in op_compute output file.
|
||||
"""
|
||||
def __init__(self, split_list):
|
||||
self._op_name = split_list[0]
|
||||
self._stream_id = int(split_list[1])
|
||||
self._start_time = float(split_list[2])
|
||||
self._duration = float(split_list[3])
|
||||
self._pid = None
|
||||
if len(split_list) == 5:
|
||||
self._pid = int(split_list[4])
|
||||
|
||||
@property
|
||||
def op_name(self):
|
||||
"""Get the name of the operator."""
|
||||
return self._op_name
|
||||
|
||||
@property
|
||||
def stream_id(self):
|
||||
"""Get the stream id of the operator."""
|
||||
return self._stream_id
|
||||
|
||||
@property
|
||||
def start_time(self):
|
||||
"""Get the execution start time of the operator."""
|
||||
return self._start_time
|
||||
|
||||
@property
|
||||
def duration(self):
|
||||
"""Get the duration of the operator execution."""
|
||||
return self._duration
|
||||
|
||||
@property
|
||||
def pid(self):
|
||||
"""Get the pid of the operator execution."""
|
||||
return self._pid
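A small sketch constructing a `TimelineContainer` from one line of an op_compute output file; the line content is made up:

```python
# Hedged sketch: the metadata line is illustrative.
line = "Default/network/Conv2D-op1 0 1.2345 0.0321 9000"
item = TimelineContainer(line.split())
print(item.op_name, item.stream_id, item.start_time, item.duration, item.pid)
# Default/network/Conv2D-op1 0 1.2345 0.0321 9000
```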
|
|
@ -0,0 +1,595 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Thr parser for parsing framework files."""
|
||||
import csv
|
||||
import enum
|
||||
import json
|
||||
import os
|
||||
import re
|
||||
|
||||
from mindspore.profiler.common.exceptions.exceptions import \
|
||||
ProfilerPathErrorException, ProfilerDirNotFoundException, \
|
||||
ProfilerFileNotFoundException, ProfilerDeviceIdMismatchException, \
|
||||
ProfilerRawFileException, ProfilerParamValueErrorException
|
||||
from mindspore.profiler.common.validator.validate_path import \
|
||||
validate_and_normalize_path
|
||||
|
||||
|
||||
class VmDataType(enum.IntEnum):
|
||||
"""Definition of vm data type."""
|
||||
NUMBER_TYPE_BEGIN = 26
|
||||
NUMBER_TYPE_BOOL = 27
|
||||
NUMBER_TYPE_INT = 28
|
||||
NUMBER_TYPE_INT8 = 29
|
||||
NUMBER_TYPE_INT16 = 30
|
||||
NUMBER_TYPE_INT32 = 31
|
||||
NUMBER_TYPE_INT64 = 32
|
||||
NUMBER_TYPE_UINT = 33
|
||||
NUMBER_TYPE_UINT8 = 34
|
||||
NUMBER_TYPE_UINT16 = 35
|
||||
NUMBER_TYPE_UINT32 = 36
|
||||
NUMBER_TYPE_UINT64 = 37
|
||||
NUMBER_TYPE_FLOAT = 38
|
||||
NUMBER_TYPE_FLOAT16 = 39
|
||||
NUMBER_TYPE_FLOAT32 = 40
|
||||
NUMBER_TYPE_FLOAT64 = 41
|
||||
NUMBER_TYPE_END = 42
|
||||
|
||||
@classmethod
|
||||
def get_data_type_name(cls, num):
|
||||
"""
|
||||
Get the name of data type by enum number.
|
||||
|
||||
Args:
|
||||
num (int): Enum number.
|
||||
|
||||
Returns:
|
||||
str, the name of data type.
|
||||
"""
|
||||
data_type = cls._value2member_map_.get(num)
|
||||
return 'UNKNOWN' if data_type is None else data_type.name
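# Illustrative lookups against the enum values defined above:
#     >>> VmDataType.get_data_type_name(39)
#     'NUMBER_TYPE_FLOAT16'
#     >>> VmDataType.get_data_type_name(999)
#     'UNKNOWN'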
|
||||
|
||||
|
||||
class GeDataType(enum.IntEnum):
|
||||
"""Definition of ge data type."""
|
||||
DT_FLOAT = 0
|
||||
DT_FLOAT16 = 1
|
||||
DT_INT8 = 2
|
||||
DT_INT16 = 6
|
||||
DT_UINT16 = 7
|
||||
DT_UINT8 = 4
|
||||
DT_INT32 = 3
|
||||
DT_INT64 = 9
|
||||
DT_UINT32 = 8
|
||||
DT_UINT64 = 10
|
||||
DT_BOOL = 12
|
||||
DT_DOUBLE = 11
|
||||
DT_STRING = 13
|
||||
DT_DUAL_SUB_INT8 = 14
|
||||
DT_DUAL_SUB_UINT8 = 15
|
||||
DT_COMPLEX64 = 16
|
||||
DT_COMPLEX128 = 17
|
||||
DT_QINT8 = 18
|
||||
DT_QINT16 = 19
|
||||
DT_QINT32 = 20
|
||||
DT_QUINT8 = 21
|
||||
DT_QUINT16 = 22
|
||||
DT_RESOURCE = 23
|
||||
DT_STRING_REF = 24
|
||||
DT_DUAL = 25
|
||||
DT_UNDEFINED = 26
|
||||
|
||||
@classmethod
|
||||
def get_data_type_name(cls, num):
|
||||
"""
|
||||
Get the name of data type by enum number.
|
||||
|
||||
Args:
|
||||
num (int): Enum number.
|
||||
|
||||
Returns:
|
||||
str, the name of data type.
|
||||
"""
|
||||
data_type = cls._value2member_map_.get(num)
|
||||
return 'UNKNOWN' if data_type is None else data_type.name
|
||||
|
||||
|
||||
class GeFormat(enum.IntEnum):
|
||||
"""Definition of ge format type."""
|
||||
FORMAT_NCHW = 0
|
||||
FORMAT_NHWC = 1
|
||||
FORMAT_ND = 2
|
||||
FORMAT_NC1HWC0 = 3
|
||||
FORMAT_FRACTAL_Z = 4
|
||||
FORMAT_NC1C0HWPAD = 5
|
||||
FORMAT_NHWC1C0 = 6
|
||||
FORMAT_FSR_NCHW = 7
|
||||
FORMAT_FRACTAL_DECONV = 8
|
||||
FORMAT_C1HWNC0 = 9
|
||||
FORMAT_FRACTAL_DECONV_TRANSPOSE = 10
|
||||
FORMAT_FRACTAL_DECONV_SP_STRIDE_TRANS = 11
|
||||
FORMAT_NC1HWC0_C04 = 12
|
||||
FORMAT_FRACTAL_Z_C04 = 13
|
||||
FORMAT_CHWN = 14
|
||||
FORMAT_FRACTAL_DECONV_SP_STRIDE8_TRANS = 15
|
||||
FORMAT_HWCN = 16
|
||||
FORMAT_NC1KHKWHWC0 = 17
|
||||
FORMAT_BN_WEIGHT = 18
|
||||
FORMAT_FILTER_HWCK = 19
|
||||
FORMAT_HASHTABLE_LOOKUP_LOOKUPS = 20
|
||||
FORMAT_HASHTABLE_LOOKUP_KEYS = 21
|
||||
FORMAT_HASHTABLE_LOOKUP_VALUE = 22
|
||||
FORMAT_HASHTABLE_LOOKUP_OUTPUT = 23
|
||||
FORMAT_HASHTABLE_LOOKUP_HITS = 24
|
||||
FORMAT_C1HWNCOC0 = 25
|
||||
FORMAT_MD = 26
|
||||
FORMAT_NDHWC = 27
|
||||
FORMAT_FRACTAL_ZZ = 28
|
||||
FORMAT_FRACTAL_NZ = 29
|
||||
FORMAT_NCDHW = 30
|
||||
FORMAT_DHWCN = 31
|
||||
FORMAT_NDC1HWC0 = 32
|
||||
FORMAT_FRACTAL_Z_3D = 33
|
||||
FORMAT_CN = 34
|
||||
FORMAT_NC = 35
|
||||
FORMAT_DHWNC = 36
|
||||
FORMAT_FRACTAL_Z_3D_TRANSPOSE = 37
|
||||
FORMAT_RESERVED = 38
|
||||
FORMAT_ALL = 39
|
||||
|
||||
@classmethod
|
||||
def get_format_name(cls, num):
|
||||
"""
|
||||
Get the name of format type by enum number.
|
||||
|
||||
Args:
|
||||
num (int): Enum number.
|
||||
|
||||
Returns:
|
||||
str, the name of format type.
|
||||
"""
|
||||
format_type = cls._value2member_map_.get(num)
|
||||
return 'UNKNOWN' if format_type is None else format_type.name
|
||||
|
||||
|
||||
class FrameworkParser:
|
||||
"""
|
||||
The parser for parsing framework files.
|
||||
|
||||
Args:
|
||||
profiling_id (str): The profiling ID.
|
||||
device_id (str): The device ID.
|
||||
output_path (str): The directory of the parsed file. Default: `./`.
|
||||
"""
|
||||
_raw_data_dir = '/var/log/npu/profiling'
|
||||
_regex_framework = r'Framework\.host\.(?P<data_type>.+)\.(?P<device_id>\d).+'
|
||||
_regex_framework_in_data = r'Framework\.host\.(?P<data_type>.+)\.' \
|
||||
r'(?P<device_id>\d)\.(?P<profiling_id>[a-zA-Z0-9]+).+'
|
||||
_col_names = [
|
||||
'task_id', 'stream_id', 'block_dim', 'full_op_name', 'op_name',
|
||||
'op_type', 'subgraph', 'op_info'
|
||||
]
|
||||
_graph_attr_name = [
|
||||
'input_format', 'input_data_type', 'input_shape', 'output_format',
|
||||
'output_data_type', 'output_shape'
|
||||
]
|
||||
|
||||
# If the task id is less than the task id threshold, the combination of
# task id and stream id represents one operator; otherwise the task id
# alone represents one operator.
|
||||
_task_id_threshold = 25000
|
||||
|
||||
def __init__(self, profiling_id, device_id, output_path='./'):
|
||||
self._profiling_path = self._get_raw_profiling_path(profiling_id)
|
||||
self._backend_type = None
|
||||
self._framework_path = {'graph': [], 'task': [], 'point': []}
|
||||
self._search_file(profiling_id, device_id)
|
||||
self._device_id = device_id
|
||||
self._save_path = self._get_save_path(device_id, output_path)
|
||||
self._task_id_full_op_name_dict = {}
|
||||
self._task_cache = {}
|
||||
self._point_info = {}
|
||||
self._parse_task_files()
|
||||
self._parse_point_files()
|
||||
|
||||
@property
|
||||
def save_path(self):
|
||||
"""
|
||||
The property of save path.
|
||||
|
||||
Returns:
|
||||
str, the save path.
|
||||
"""
|
||||
return self._save_path
|
||||
|
||||
@property
|
||||
def point_info(self):
|
||||
"""
|
||||
The property of the framework point information.
|
||||
|
||||
Returns:
|
||||
dict, the framework point information.
|
||||
"""
|
||||
return self._point_info
|
||||
|
||||
def to_task_id_full_op_name_dict(self):
|
||||
"""
|
||||
Get the task id and full operator name dict.
|
||||
|
||||
Returns:
|
||||
dict, the task id and full operator name dict.
|
||||
"""
|
||||
return self._task_id_full_op_name_dict
|
||||
|
||||
def parse(self):
|
||||
"""Parse the framework files."""
|
||||
self._parse_graph_files_and_save(self._task_cache)
|
||||
del self._task_cache
|
||||
|
||||
def check_op_name(self, op_name, is_prefix=True):
|
||||
"""
|
||||
Check whether the operator name exists.
|
||||
|
||||
Args:
|
||||
op_name (str): The operator name or operator name prefix.
|
||||
is_prefix (bool): `True` if the op_name is prefix, else `False`.
|
||||
Default: True.
|
||||
|
||||
Returns:
|
||||
bool, `True` if the operator name does exist in framework file, else
|
||||
`False`.
|
||||
"""
|
||||
if not op_name:
|
||||
raise ProfilerParamValueErrorException('The op_name should exist.')
|
||||
for full_op_name in self._task_id_full_op_name_dict.values():
|
||||
if full_op_name:
|
||||
if is_prefix and full_op_name.startswith(op_name):
|
||||
return True
|
||||
if not is_prefix and op_name == full_op_name:
|
||||
return True
|
||||
return False
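# Illustrative usage sketch (the profiling id, device id and output dir are
# hypothetical; the constructor expects a real profiling job directory under
# the raw data dir):
#     >>> parser = FrameworkParser('JOB123456789', '0', './profiler_output')
#     >>> parser.parse()
#     >>> parser.check_op_name('Default/network', is_prefix=True)
#     >>> op_name_map = parser.to_task_id_full_op_name_dict()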
|
||||
|
||||
def _get_raw_profiling_path(self, profiling_id):
|
||||
"""
|
||||
Get raw profiling path.
|
||||
|
||||
Args:
|
||||
profiling_id (str): The profiling ID.
|
||||
|
||||
Returns:
|
||||
str, the raw profiling path.
|
||||
|
||||
Raises:
|
||||
ProfilerPathErrorException: If the profiling path is invalid.
|
||||
ProfilerDirNotFoundException: If the profiling dir is not found.
|
||||
"""
|
||||
profiling_path = os.path.join(self._raw_data_dir, profiling_id)
|
||||
try:
|
||||
profiling_path = validate_and_normalize_path(profiling_path)
|
||||
except RuntimeError:
|
||||
raise ProfilerPathErrorException('Profiling path is invalid.')
|
||||
if not os.path.isdir(profiling_path):
|
||||
raise ProfilerDirNotFoundException(profiling_path)
|
||||
return profiling_path
|
||||
|
||||
def _search_file(self, profiling_id, device_id):
|
||||
"""
|
||||
Search all framework files in raw profiling path.
|
||||
|
||||
Args:
|
||||
profiling_id (str): The profiling ID.
|
||||
device_id (str): The device ID.
|
||||
|
||||
Raises:
|
||||
ProfilerFileNotFoundException: If the framework files are not found.
|
||||
"""
|
||||
# First search in the JOB dir; if nothing is found there, search in the
# sub directory under the JOB dir.
|
||||
self._search_file_from_job_path(device_id, search_in_sub_path=False)
|
||||
if self._backend_type is None:
|
||||
self._search_file_from_job_path(device_id, search_in_sub_path=True)
|
||||
self._search_file_from_data_path(profiling_id, device_id)
|
||||
|
||||
if self._backend_type is None:
|
||||
raise ProfilerFileNotFoundException('Framework')
|
||||
self._framework_path['graph'].sort()
|
||||
self._framework_path['task'].sort()
|
||||
|
||||
def _search_file_from_job_path(self, device_id, search_in_sub_path=False):
|
||||
"""
|
||||
Search framework files from job path.
|
||||
|
||||
Args:
|
||||
device_id (str): The device ID.
|
||||
search_in_sub_path (bool): `True` if search file in profiling dir,
|
||||
else search in profiling sub dir. Default: False.
|
||||
|
||||
Raises:
|
||||
ProfilerRawFileException: If the framework file type is inconsistent.
|
||||
ProfilerDeviceIdMismatchException: If the device id does not match
the framework files in the raw dir.
|
||||
"""
|
||||
profiling_dir = os.path.join(self._profiling_path, 'data') \
|
||||
if search_in_sub_path else self._profiling_path
|
||||
if not os.path.isdir(profiling_dir):
|
||||
return
|
||||
|
||||
files = os.listdir(profiling_dir)
|
||||
for file in files:
|
||||
pattern = re.search(self._regex_framework, file)
|
||||
if not pattern or file.endswith('.done'):
|
||||
continue
|
||||
attrs = pattern.groupdict()
|
||||
|
||||
device_id_in_path = attrs.get('device_id')
|
||||
if device_id_in_path != device_id:
|
||||
raise ProfilerDeviceIdMismatchException()
|
||||
|
||||
data_type = attrs.get('data_type')
|
||||
if data_type.startswith('vm.'):
|
||||
if self._backend_type and self._backend_type != 'vm':
|
||||
raise ProfilerRawFileException('Backend type is inconsistent.')
|
||||
self._backend_type = 'vm'
|
||||
data_type = data_type.split('.')[1]
|
||||
else:
|
||||
if self._backend_type and self._backend_type != 'ge':
|
||||
raise ProfilerRawFileException('Backend type is inconsistent.')
|
||||
self._backend_type = 'ge'
|
||||
if data_type.startswith('graph_desc_info'):
|
||||
self._framework_path['graph'].append(
|
||||
os.path.join(profiling_dir, file)
|
||||
)
|
||||
elif data_type.startswith('task_desc_info'):
|
||||
self._framework_path['task'].append(
|
||||
os.path.join(profiling_dir, file)
|
||||
)
|
||||
elif data_type.startswith('point'):
|
||||
self._framework_path['point'].append(
|
||||
os.path.join(profiling_dir, file)
|
||||
)
|
||||
|
||||
def _search_file_from_data_path(self, profiling_id, device_id):
|
||||
"""
|
||||
Search framework files from data path.
|
||||
|
||||
Args:
|
||||
profiling_id (str): The profiling ID.
|
||||
device_id (str): The device ID.
|
||||
|
||||
Raises:
|
||||
ProfilerRawFileException: If the framework file type is inconsistent.
|
||||
ProfilerDeviceIdMismatchException: If the device id does not match
the framework files in the raw dir.
|
||||
"""
|
||||
profiling_data_path = os.path.join(
|
||||
self._raw_data_dir, 'container', device_id, 'data'
|
||||
)
|
||||
if not os.path.isdir(profiling_data_path):
|
||||
return
|
||||
|
||||
files = os.listdir(profiling_data_path)
|
||||
for file in files:
|
||||
pattern = re.search(self._regex_framework_in_data, file)
|
||||
if not pattern or file.endswith('.done') or file.endswith('.zip'):
|
||||
continue
|
||||
attrs = pattern.groupdict()
|
||||
|
||||
profiling_id_in_path = attrs.get('profiling_id')
|
||||
if profiling_id_in_path != profiling_id:
|
||||
continue
|
||||
|
||||
device_id_in_path = attrs.get('device_id')
|
||||
if device_id_in_path != device_id:
|
||||
raise ProfilerDeviceIdMismatchException()
|
||||
|
||||
data_type = attrs.get('data_type')
|
||||
if data_type.startswith('vm.'):
|
||||
if self._backend_type and self._backend_type != 'vm':
|
||||
raise ProfilerRawFileException('Backend type is inconsistent.')
|
||||
self._backend_type = 'vm'
|
||||
data_type = data_type.split('.')[1]
|
||||
else:
|
||||
if self._backend_type and self._backend_type != 'ge':
|
||||
raise ProfilerRawFileException('Backend type is inconsistent.')
|
||||
self._backend_type = 'ge'
|
||||
if data_type.startswith('graph_desc_info'):
|
||||
self._framework_path['graph'].append(
|
||||
os.path.join(profiling_data_path, file)
|
||||
)
|
||||
elif data_type.startswith('task_desc_info'):
|
||||
self._framework_path['task'].append(
|
||||
os.path.join(profiling_data_path, file)
|
||||
)
|
||||
elif data_type.startswith('point'):
|
||||
self._framework_path['point'].append(
|
||||
os.path.join(profiling_data_path, file)
|
||||
)
|
||||
|
||||
def _get_save_path(self, device_id, output_path):
|
||||
"""
|
||||
Get the save path.
|
||||
|
||||
Args:
|
||||
device_id (str): The device ID.
|
||||
output_path (str): The output dir.
|
||||
|
||||
Returns:
|
||||
str, the save path.
|
||||
|
||||
Raises:
|
||||
ProfilerPathErrorException: If the output path is invalid.
|
||||
ProfilerDirNotFoundException: If the output dir is not found.
|
||||
"""
|
||||
try:
|
||||
output_dir = validate_and_normalize_path(output_path)
|
||||
except RuntimeError:
|
||||
raise ProfilerPathErrorException('Output path is invalid.')
|
||||
if not os.path.isdir(output_dir):
|
||||
raise ProfilerDirNotFoundException(output_dir)
|
||||
return os.path.join(
|
||||
output_dir, '_'.join(['framework', 'raw', device_id]) + '.csv'
|
||||
)
|
||||
|
||||
def _parse_task_files(self):
|
||||
"""Parse the framework task files."""
|
||||
for path in self._framework_path['task']:
|
||||
with open(path, 'r') as file:
|
||||
for task_info in file:
|
||||
infos = task_info.strip('\n').split(' ')
|
||||
infos = infos[1:] if len(infos) == 5 else infos
|
||||
# key is op name, value is [task id, stream id, block_dim]
|
||||
self._task_cache[infos[0]] = [infos[2], infos[3], infos[1]]
|
||||
|
||||
# if the task id is less than the task id threshold, the
|
||||
# stream id and task id correspond to an operator
|
||||
task_id = infos[2]
|
||||
if int(task_id) < self._task_id_threshold:
|
||||
task_id = '_'.join([infos[3], task_id])
|
||||
self._task_id_full_op_name_dict[task_id] = infos[0]
|
||||
|
||||
def _parse_graph_files_and_save(self, task_cache):
|
||||
"""
|
||||
Parse the framework graph files and save the framework information.
|
||||
|
||||
Args:
|
||||
task_cache (dict): The task information cache.
|
||||
"""
|
||||
with open(self._save_path, 'w') as save_file:
|
||||
csv_writer = csv.writer(save_file)
|
||||
csv_writer.writerow(self._col_names)
|
||||
for path in self._framework_path['graph']:
|
||||
with open(path, 'r') as graph_file:
|
||||
for graph_info in graph_file:
|
||||
result = self._parse_one_row_graph_info(graph_info)
|
||||
task_info = task_cache.get(result[0])
|
||||
if task_info:
|
||||
task_info.extend(result)
|
||||
csv_writer.writerow(task_info)
|
||||
del task_cache[result[0]]
|
||||
else:
|
||||
save_info = [None, None, None]
|
||||
save_info.extend(result)
|
||||
csv_writer.writerow(save_info)
|
||||
|
||||
none_list = [None, None, None, None]
|
||||
for key, value in task_cache.items():
|
||||
value.append(key)
|
||||
value.extend(none_list)
|
||||
csv_writer.writerow(value)
|
||||
|
||||
def _parse_one_row_graph_info(self, row_info):
|
||||
"""
|
||||
Parse the graph information in one row.
|
||||
|
||||
Args:
|
||||
row_info (str): One row graph information.
|
||||
|
||||
Returns:
|
||||
list[str], the parsed graph information.
|
||||
"""
|
||||
full_op_name = None
|
||||
op_name = None
|
||||
subgraph_name = None
|
||||
op_type = None
|
||||
op_info = dict()
|
||||
cur_op_info_key = None
|
||||
|
||||
infos = row_info.strip('\n').split(' ')
|
||||
for info in infos:
|
||||
attr_name, attr_value = info.split(':', 1)
|
||||
if attr_name == 'op_name':
|
||||
full_op_name = attr_value
|
||||
subgraph_name = self._get_subgraph_name(full_op_name)
|
||||
op_name = self._get_op_name(full_op_name, subgraph_name)
|
||||
elif attr_name == 'op_type':
|
||||
op_type = attr_value
|
||||
elif attr_name in ['input_id', 'output_id']:
|
||||
cur_op_info_key = '{}_{}'.format(
|
||||
attr_name.split('_')[0], attr_value
|
||||
)
|
||||
op_info[cur_op_info_key] = dict()
|
||||
elif attr_name in self._graph_attr_name:
|
||||
op_attr = attr_name.split('_', 1)[1]
|
||||
if op_attr == 'shape':
|
||||
attr_value = attr_value.strip('"')
|
||||
if self._backend_type == 'vm':
|
||||
if op_attr == 'data_type':
|
||||
attr_value = VmDataType.get_data_type_name(
|
||||
int(attr_value)
|
||||
)
|
||||
else:
|
||||
if op_attr == 'data_type':
|
||||
attr_value = GeDataType.get_data_type_name(
|
||||
int(attr_value)
|
||||
)
|
||||
elif op_attr == 'format':
|
||||
attr_value = GeFormat.get_format_name(int(attr_value))
|
||||
|
||||
op_info[cur_op_info_key][op_attr] = attr_value
|
||||
|
||||
# the list info are full_op_name, op_name, op_type, subgraph, op_info
|
||||
return [full_op_name, op_name, op_type, subgraph_name,
|
||||
json.dumps(op_info)]
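# One illustrative graph_desc_info row (a single space-separated line of
# "key:value" pairs) as this method parses it. The attribute names follow the
# checks above; the concrete values and layout of a real file are assumptions:
#     op_name:Default/network/Conv2D-op1 op_type:Conv2D input_id:0 input_format:0 input_data_type:0 input_shape:"32,3,224,224"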
|
||||
|
||||
def _get_subgraph_name(self, full_op_name):
|
||||
"""
|
||||
Get subgraph name.
|
||||
|
||||
Args:
|
||||
full_op_name (str): The full operator name.
|
||||
|
||||
Returns:
|
||||
str, the subgraph name.
|
||||
"""
|
||||
subgraph_name = full_op_name.split('/', 1)[0]
|
||||
if subgraph_name in ['Default', 'Gradients']:
|
||||
return subgraph_name
|
||||
return None
|
||||
|
||||
def _get_op_name(self, full_op_name, subgraph_name):
|
||||
"""
|
||||
Get operator name.
|
||||
|
||||
Args:
|
||||
full_op_name (str): The full operator name.
|
||||
subgraph_name (str): The subgraph name.
|
||||
|
||||
Returns:
|
||||
str, the operator name.
|
||||
"""
|
||||
if subgraph_name is None:
|
||||
return full_op_name
|
||||
|
||||
if self._backend_type == 'vm':
|
||||
return full_op_name.split('/')[-1]
|
||||
|
||||
strs = full_op_name.split(subgraph_name + '/')
|
||||
op_name = None
|
||||
for name_str in strs:
|
||||
if not name_str:
|
||||
continue
|
||||
if op_name is None:
|
||||
op_name = name_str.split('/')[-1]
|
||||
else:
|
||||
op_name = '+'.join([op_name, name_str.split('/')[-1]])
|
||||
return op_name
|
||||
|
||||
def _parse_point_files(self):
|
||||
"""Parse the framework point files."""
|
||||
for path in self._framework_path['point']:
|
||||
with open(path, 'r') as file:
|
||||
for point_info in file:
|
||||
infos = point_info.strip('\n').split(' ')
|
||||
self._point_info[int(infos[0])] = infos[1]
@ -0,0 +1,109 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""The parser for hwts log file."""
|
||||
import os
|
||||
import struct
|
||||
from mindspore.profiler.common.util import fwrite_format, get_file_join_name
|
||||
from mindspore import log as logger
|
||||
|
||||
|
||||
class HWTSLogParser:
|
||||
"""
|
||||
The Parser for hwts log files.
|
||||
|
||||
Args:
|
||||
input_path (str): The profiling job path. Such as: '/var/log/npu/profiling/JOBAIFGJEJFEDCBAEADIFJAAAAAAAAAA".
|
||||
output_filename (str): The output data path and name. Such as: './output_format_data_hwts_0.txt'.
|
||||
"""
|
||||
|
||||
_source_file_target = 'hwts.log.data.45.dev.profiler_default_tag'
|
||||
_dst_file_title = 'title:45 HWTS data'
|
||||
_dst_file_column_title = 'Type cnt Core_ID Block_ID Task_ID Cycle_counter Stream_ID'
|
||||
|
||||
def __init__(self, input_path, output_filename):
|
||||
self._input_path = input_path
|
||||
self._output_filename = output_filename
|
||||
self._source_file_name = self._get_source_file()
|
||||
|
||||
def _get_source_file(self):
|
||||
"""Get hwts log file name, which was created by ada service."""
|
||||
|
||||
file_name = get_file_join_name(self._input_path, self._source_file_target)
|
||||
if not file_name:
|
||||
data_path = os.path.join(self._input_path, "data")
|
||||
file_name = get_file_join_name(data_path, self._source_file_target)
|
||||
if not file_name:
|
||||
msg = "Fail to find hwts log file, under profiling directory"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
return file_name
|
||||
|
||||
def execute(self):
|
||||
"""
|
||||
Execute the parser, get result data, and write it to the output file.
|
||||
|
||||
Returns:
|
||||
bool, whether the hwts log was analysed successfully.
|
||||
"""
|
||||
|
||||
content_format = ['QIIIIIIIIIIII', 'QIIQIIIIIIII', 'IIIIQIIIIIIII']
|
||||
log_type = ['Start of task', 'End of task', 'Start of block', 'End of block', 'Block PMU']
|
||||
|
||||
result_data = ""
|
||||
|
||||
with open(self._source_file_name, 'rb') as hwts_data:
|
||||
while True:
|
||||
line = hwts_data.read(64)
|
||||
if line:
|
||||
if not line.strip():
|
||||
continue
|
||||
else:
|
||||
break
|
||||
byte_first_four = struct.unpack('BBHHH', line[0:8])
|
||||
byte_first = bin(byte_first_four[0]).replace('0b', '').zfill(8)
|
||||
ms_type = byte_first[-3:]
|
||||
is_warn_res0_ov = byte_first[4]
|
||||
cnt = int(byte_first[0:4], 2)
|
||||
core_id = byte_first_four[1]
|
||||
blk_id, task_id = byte_first_four[3], byte_first_four[4]
|
||||
if ms_type in ['000', '001', '010']: # log type 0,1,2
|
||||
result = struct.unpack(content_format[0], line[8:])
|
||||
syscnt = result[0]
|
||||
stream_id = result[1]
|
||||
elif ms_type == '011': # log type 3
|
||||
result = struct.unpack(content_format[1], line[8:])
|
||||
syscnt = result[0]
|
||||
stream_id = result[1]
|
||||
elif ms_type == '100': # log type 4
|
||||
result = struct.unpack(content_format[2], line[8:])
|
||||
stream_id = result[2]
|
||||
if is_warn_res0_ov == '0':
|
||||
syscnt = result[4]
|
||||
else:
|
||||
syscnt = None
|
||||
else:
|
||||
logger.info("Profiling: invalid hwts log record type %s", ms_type)
|
||||
continue
|
||||
|
||||
if int(task_id) < 25000:
|
||||
task_id = str(stream_id) + "_" + str(task_id)
|
||||
result_data += ("%-14s %-4s %-8s %-9s %-8s %-15s %s\n" %(log_type[int(ms_type, 2)], cnt, core_id,
|
||||
blk_id, task_id, syscnt, stream_id))
|
||||
|
||||
fwrite_format(self._output_filename, data_source=self._dst_file_title, is_start=True)
|
||||
fwrite_format(self._output_filename, data_source=self._dst_file_column_title)
|
||||
fwrite_format(self._output_filename, data_source=result_data)
|
||||
|
||||
return True
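# Illustrative sketch (not part of the original file). First, how the header of
# one 64-byte record is decoded above, using made-up bytes:
#     >>> import struct
#     >>> header = struct.pack('BBHHH', 0b00010001, 2, 0, 5, 30001)
#     >>> byte_first_four = struct.unpack('BBHHH', header)
#     >>> bin(byte_first_four[0]).replace('0b', '').zfill(8)[-3:]  # log type bits
#     '001'
# And a hypothetical end-to-end run of the parser:
#     >>> parser = HWTSLogParser('/var/log/npu/profiling/JOB123456789',
#     ...                        './output_format_data_hwts_0.txt')
#     >>> parser.execute()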
@ -0,0 +1,581 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""The integrator for integrating parsed profiling files."""
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from decimal import Decimal
|
||||
|
||||
from mindspore import log as logger
|
||||
from mindspore.profiler.common.exceptions.exceptions import ProfilerIOException, \
|
||||
ProfilerFileNotFoundException, ProfilerRawFileException
|
||||
from mindspore.profiler.common.util import query_latest_trace_time_file, to_int, to_millisecond
|
||||
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
|
||||
from mindspore.profiler.parser.container import TimelineContainer
|
||||
|
||||
SIZE_LIMIT = 20 * 1024 * 1024 # 20MB
|
||||
|
||||
class Integrator:
|
||||
"""
|
||||
The integrator for integrating parsed profiling files.
|
||||
|
||||
Args:
|
||||
profiling_dir (str): The directory where the parsed profiling files are
|
||||
located.
|
||||
device_id (str): The device ID.
|
||||
"""
|
||||
_file_name_aicore_detail_time = 'output_op_compute_time_{}.txt'
|
||||
_file_name_aicpu_time = 'output_data_preprocess_aicpu_{}.txt'
|
||||
_file_name_framework = 'framework_raw_{}.csv'
|
||||
_header_aicore_type = ['op_type', 'execution_time', 'execution_frequency',
|
||||
'percent']
|
||||
_header_aicore_detail = ['full_op_name', 'execution_time']
|
||||
_header_aicpu = ['serial_number', 'op_type', 'total_time', 'dispatch_time',
|
||||
'run_start', 'run_end']
|
||||
|
||||
_file_name_aicore_type_time = 'aicore_intermediate_{}_type.csv'
|
||||
_file_name_aicore_detail_info = 'aicore_intermediate_{}_detail.csv'
|
||||
_aicore_data = []
|
||||
_aicore_detail_data = []
|
||||
_aicore_trace_data = []
|
||||
_col_names = []
|
||||
|
||||
def __init__(self, profiling_dir, device_id):
|
||||
self._profiling_dir = profiling_dir
|
||||
self._device_id = device_id
|
||||
self._op_time_cache = {}
|
||||
self._total_time = Decimal('0.0')
|
||||
|
||||
def integrate(self):
|
||||
"""Integrate the parsed profiling files."""
|
||||
self._parse_aicore_detail_time()
|
||||
self._parse_aicore_type_time()
|
||||
self._parse_aicpu_time()
|
||||
|
||||
def get_aicore_data(self):
|
||||
self._aicore_data_load()
|
||||
return self._aicore_data
|
||||
|
||||
def get_aicore_detail_data(self):
|
||||
self._aicore_detail_data_load()
|
||||
return self._aicore_detail_data
|
||||
|
||||
def get_aicore_trace_data(self):
|
||||
self._aicore_trace_data_load()
|
||||
return self._aicore_trace_data
|
||||
|
||||
def query_for_all_reduce(self):
|
||||
return self._query_for_all_reduce()
|
||||
|
||||
|
||||
def _parse_aicore_type_time(self):
|
||||
"""Parse the parsed AICORE operator type file."""
|
||||
framework_file = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._file_name_framework.format(self._device_id)
|
||||
)
|
||||
if not os.path.isfile(framework_file):
|
||||
return
|
||||
|
||||
op_name_type_cache = {}
|
||||
with open(framework_file, 'r') as src_file:
|
||||
csv_reader = csv.reader(src_file)
|
||||
_ = next(csv_reader)
|
||||
|
||||
for row in csv_reader:
|
||||
op_name_type_cache[row[3]] = row[5]
|
||||
|
||||
op_type_time_cache = {}
|
||||
for full_op_name, op_time in self._op_time_cache.items():
|
||||
op_type = op_name_type_cache.get(full_op_name)
|
||||
if op_type_time_cache.get(op_type) is None:
|
||||
op_type_time_cache[op_type] = [op_time, 1]
|
||||
else:
|
||||
op_type_time_cache[op_type][0] += op_time
|
||||
op_type_time_cache[op_type][1] += 1
|
||||
|
||||
op_type_file_name = 'aicore_intermediate_' + self._device_id + '_type.csv'
|
||||
op_type_file_path = os.path.join(self._profiling_dir, op_type_file_name)
|
||||
with open(op_type_file_path, 'w') as type_file:
|
||||
csv_writer = csv.writer(type_file)
|
||||
csv_writer.writerow(self._header_aicore_type)
|
||||
|
||||
for op_type, op_type_time_info in op_type_time_cache.items():
|
||||
type_info = [
|
||||
op_type, op_type_time_info[0], op_type_time_info[1],
|
||||
round((op_type_time_info[0] / self._total_time) * 100, 2)
|
||||
]
|
||||
csv_writer.writerow(type_info)
|
||||
|
||||
def _parse_aicore_detail_time(self):
|
||||
"""Parse the parsed AICORE operator time file."""
|
||||
aicore_detail_file = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._file_name_aicore_detail_time.format(self._device_id)
|
||||
)
|
||||
if not os.path.isfile(aicore_detail_file):
|
||||
return
|
||||
|
||||
op_detail_file_name = 'aicore_intermediate_' + self._device_id + '_detail.csv'
|
||||
op_detail_file_path = os.path.join(
|
||||
self._profiling_dir, op_detail_file_name
|
||||
)
|
||||
with open(aicore_detail_file, 'r') as src_file:
|
||||
row = src_file.readline()
|
||||
if row.startswith('op_name'):
|
||||
_ = src_file.readline()
|
||||
elif row.startswith('====='):
|
||||
_ = src_file.readline()
|
||||
_ = src_file.readline()
|
||||
else:
|
||||
return
|
||||
|
||||
with open(op_detail_file_path, 'w') as detail_file:
|
||||
csv_writer = csv.writer(detail_file)
|
||||
csv_writer.writerow(self._header_aicore_detail)
|
||||
|
||||
while True:
|
||||
row = src_file.readline()
|
||||
if not row:
|
||||
break
|
||||
|
||||
op_infos = row.split()
|
||||
if op_infos[0] == 'total':
|
||||
self._total_time = Decimal(op_infos[2])
|
||||
continue
|
||||
self._op_time_cache[op_infos[0]] = Decimal(op_infos[1])
|
||||
csv_writer.writerow([op_infos[0], op_infos[1]])
|
||||
|
||||
def _parse_aicpu_time(self):
|
||||
"""Parse the parsed AICPU operator time file."""
|
||||
aicpu_file = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._file_name_aicpu_time.format(self._device_id)
|
||||
)
|
||||
if not os.path.isfile(aicpu_file):
|
||||
return
|
||||
|
||||
save_file_name = 'aicpu_intermediate_' + self._device_id + '.csv'
|
||||
save_file_path = os.path.join(self._profiling_dir, save_file_name)
|
||||
with open(aicpu_file, 'r') as src_file:
|
||||
row = src_file.readline()
|
||||
if not row.startswith('serial_number'):
|
||||
return
|
||||
_ = src_file.readline()
|
||||
with open(save_file_path, 'w') as save_file:
|
||||
csv_writer = csv.writer(save_file)
|
||||
csv_writer.writerow(self._header_aicpu)
|
||||
|
||||
while True:
|
||||
row = src_file.readline()
|
||||
if not row:
|
||||
break
|
||||
infos = row.split()
|
||||
if infos[0] == 'AI':
|
||||
continue
|
||||
csv_writer.writerow(infos)
|
||||
|
||||
def _aicore_data_load(self):
|
||||
"""Load data according to the parsed AICORE operator types file."""
|
||||
op_type_file_path = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._file_name_aicore_type_time.format(self._device_id)
|
||||
)
|
||||
if not os.path.isfile(op_type_file_path):
|
||||
logger.warning('The file <%s> does not exist.', op_type_file_path)
|
||||
return
|
||||
|
||||
with open(op_type_file_path, 'r') as file:
|
||||
csv_reader = csv.reader(file)
|
||||
_ = next(csv_reader)
|
||||
for info in csv_reader:
|
||||
self._aicore_data.append([info[0], float(info[1]), int(info[2]), float(info[3])])
|
||||
|
||||
def _aicore_detail_data_load(self):
|
||||
"""Load data according to the parsed AICORE operator file."""
|
||||
op_detail_file_path = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._file_name_aicore_detail_info.format(self._device_id)
|
||||
)
|
||||
framework_file_path = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._file_name_framework.format(self._device_id)
|
||||
)
|
||||
if not os.path.isfile(op_detail_file_path):
|
||||
logger.warning('The file <%s> does not exist.', op_detail_file_path)
|
||||
return
|
||||
if not os.path.isfile(framework_file_path):
|
||||
logger.warning('The file <%s> does not exist.', framework_file_path)
|
||||
return
|
||||
|
||||
framework_infos = dict()
|
||||
with open(framework_file_path, 'r') as file:
|
||||
csv_reader = csv.reader(file)
|
||||
_ = next(csv_reader)
|
||||
for info in csv_reader:
|
||||
framework_infos[info[3]] = [
|
||||
info[3], info[4], info[5], info[6], json.loads(info[7]) if info[7] else None]
|
||||
|
||||
with open(op_detail_file_path, 'r') as file:
|
||||
csv_reader = csv.reader(file)
|
||||
_ = next(csv_reader)
|
||||
for info in csv_reader:
|
||||
framework_info = framework_infos.get(info[0])
|
||||
self._aicore_detail_data.append(
|
||||
[
|
||||
framework_info[1], framework_info[2], float(info[1]),
|
||||
framework_info[3], framework_info[0], framework_info[4]
|
||||
]
|
||||
)
|
||||
del framework_infos
|
||||
|
||||
|
||||
def _aicore_trace_data_load(self):
|
||||
"""Load data according to the parsed AICORE operator types file."""
|
||||
file_path = query_latest_trace_time_file(self._profiling_dir, int(self._device_id))
|
||||
if not file_path:
|
||||
logger.error("Failed to find parsed trace time file.")
|
||||
raise ProfilerFileNotFoundException('parsed step trace time file')
|
||||
with open(file_path, 'r') as handle:
|
||||
csv_reader = csv.reader(handle)
|
||||
self.__column__ = next(csv_reader)
|
||||
self._aicore_trace_data = list(csv_reader)
|
||||
self._size = len(self._aicore_trace_data) - 1
|
||||
self._display_col_names = self._col_names[:]
|
||||
self._load_point_info()
|
||||
|
||||
def _load_point_info(self):
|
||||
"""Load point info."""
|
||||
file_path = os.path.join(self._profiling_dir, 'step_trace_point_info.json')
|
||||
if os.path.isfile(file_path):
|
||||
with open(file_path, 'r', encoding='utf-8') as file:
|
||||
try:
|
||||
self._point_info = json.load(file)
|
||||
except (json.JSONDecodeError, TypeError) as err:
|
||||
logger.warning(err)
|
||||
raise ProfilerRawFileException('Failed to parse point info file.')
|
||||
|
||||
def _query_for_all_reduce(self):
|
||||
"""
|
||||
Query for all reduce info.
|
||||
|
||||
Returns:
|
||||
list[dict], reduce information. Each item is the reduce info for one step.
|
||||
The reduce info is formatted like:
|
||||
{stream_id: List[Tuple(start_point, end_point, duration, field_name)]}.
|
||||
"""
|
||||
self._aicore_trace_data_load()
|
||||
reduce_infos = []
|
||||
for row_info in self._aicore_trace_data[:-1]:
|
||||
row_info_dict = self._get_info_dict_from_row_data(row_info, 'systime')
|
||||
reduce_info = self._sort_reduce_by_time(row_info_dict)
|
||||
if reduce_info:
|
||||
reduce_infos.extend(reduce_info)
|
||||
|
||||
return reduce_infos
|
||||
|
||||
def _get_info_dict_from_row_data(self, row_info, time_type):
|
||||
"""
|
||||
Get step info in dict format.
|
||||
|
||||
Args:
|
||||
row_info (list[str]): Step info, the values correspond to `__column__`.
time_type (str): The value type. `systime` keeps the original value,
`realtime` converts the value into milliseconds.
|
||||
|
||||
Returns:
|
||||
dict, step trace information. The key is in `__column__`.
|
||||
"""
|
||||
row_info_dict = {}
|
||||
for key, value in zip(self.__column__, row_info):
|
||||
if key == 'step_num':
|
||||
continue
|
||||
value = to_int(value, key)
|
||||
row_info_dict[key] = to_millisecond(value) if time_type == 'realtime' else value
|
||||
return row_info_dict
|
||||
|
||||
def _sort_reduce_by_time(self, row_info_dict):
|
||||
"""
|
||||
Sort reduce info by time.
|
||||
|
||||
Args:
|
||||
row_info_dict (dict): Step trace information.
|
||||
|
||||
Returns:
|
||||
list, including the all reduce info sorted by start time only.
|
||||
[
|
||||
[reduce_field, stream_id, reduce_start, reduce_duration],
|
||||
[...],
|
||||
[...]
|
||||
]
|
||||
"""
|
||||
factor = 1e5 # convert time unit from 10ns to 1ms
|
||||
reduce_pid = 10000
|
||||
reduce_info = []
|
||||
reduce_fields = [field_name for field_name in self.__column__
|
||||
if field_name.startswith('stream_') and not field_name.endswith('point')]
|
||||
for reduce_field in reduce_fields:
|
||||
reduce_start = row_info_dict.get(reduce_field + '_start_point')
|
||||
reduce_start = reduce_start / factor \
|
||||
if reduce_start else 0
|
||||
reduce_duration = row_info_dict.get(reduce_field)
|
||||
reduce_duration = reduce_duration / factor if reduce_duration else 0
|
||||
if not (reduce_start and reduce_duration):
|
||||
logger.info("Reduce event missing value.")
|
||||
continue
|
||||
cur_stream_id = reduce_field.split('_', 2)[1]
|
||||
reduce_meta = [reduce_field, int(cur_stream_id), reduce_start,
|
||||
reduce_duration, reduce_pid]
|
||||
reduce_info.append(reduce_meta)
|
||||
|
||||
return reduce_info
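# Illustrative usage sketch of the integrator (assumes the parsed profiling
# files described by the _file_name_* templates already exist under the
# hypothetical directory './profiler_output'):
#     >>> integrator = Integrator('./profiler_output', '0')
#     >>> integrator.integrate()
#     >>> aicore_type_info = integrator.get_aicore_data()
#     >>> aicore_detail_info = integrator.get_aicore_detail_data()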
|
||||
|
||||
|
||||
class TimelineAnalyser:
|
||||
"""
|
||||
Analyse timeline data from file.
|
||||
"""
|
||||
__col_names__ = ['op_name', 'stream_id', 'start_time', 'duration']
|
||||
_output_timeline_data_file_path = 'output_timeline_data_{}.txt'
|
||||
_min_cycle_counter_file_path = 'min_cycle_counter_{}.txt'
|
||||
_display_filename = 'timeline_display_{}.json'
|
||||
_timeline_summary_filename = 'timeline_summary_{}.json'
|
||||
_timeline_meta = []
|
||||
_timeline_summary = {
|
||||
'total_time': 0,
|
||||
'num_of_streams': 0,
|
||||
'num_of_ops': 0,
|
||||
'op_exe_times': 0
|
||||
}
|
||||
|
||||
def __init__(self, profiling_dir, device_id):
|
||||
self._profiling_dir = profiling_dir
|
||||
self._device_id = device_id
|
||||
|
||||
def write_timeline(self):
|
||||
"""Load data according to the parsed profiling files."""
|
||||
# Write timeline to file.
|
||||
logger.info('Writing timeline file...')
|
||||
self.write_timeline_to_json_by_limitation()
|
||||
logger.info('Finished file writing!')
|
||||
|
||||
def write_timeline_to_json_by_limitation(self):
|
||||
"""Write timeline to json by limitation."""
|
||||
display_filename = self._display_filename.format(self._device_id)
|
||||
display_file_path = os.path.join(
|
||||
self._profiling_dir,
|
||||
display_filename
|
||||
)
|
||||
display_file_path = validate_and_normalize_path(display_file_path)
|
||||
|
||||
length = len(self._timeline_meta)
|
||||
try:
|
||||
with open(display_file_path, 'w') as json_file:
|
||||
json_file.write('[')
|
||||
for index, item in enumerate(self._timeline_meta):
|
||||
json.dump(item, json_file)
|
||||
file_size = os.path.getsize(display_file_path)
|
||||
if file_size > SIZE_LIMIT:
|
||||
break
|
||||
if index == length - 1:
|
||||
break
|
||||
json_file.write(',')
|
||||
json_file.write(']')
|
||||
except (IOError, OSError) as err:
|
||||
logger.error('Error occurred when write timeline display file: %s', err)
|
||||
raise ProfilerIOException
|
||||
|
||||
def write_timeline_summary(self):
|
||||
"""Write timeline summary to json."""
|
||||
timeline_summary_file_path = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._timeline_summary_filename.format(self._device_id)
|
||||
)
|
||||
|
||||
timeline_summary_file_path = validate_and_normalize_path(timeline_summary_file_path)
|
||||
|
||||
try:
|
||||
with open(timeline_summary_file_path, 'w') as json_file:
|
||||
json.dump(self._timeline_summary, json_file)
|
||||
except (IOError, OSError) as err:
|
||||
logger.error('Error occurred when write timeline summary file: %s', err)
|
||||
raise ProfilerIOException
|
||||
|
||||
def _load_timeline_data(self):
|
||||
"""Load timeline data from file."""
|
||||
file_path = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._output_timeline_data_file_path.format(self._device_id)
|
||||
)
|
||||
file_path = validate_and_normalize_path(file_path)
|
||||
if not os.path.exists(file_path):
|
||||
logger.error("Failed to find parsed timeline file.")
|
||||
raise ProfilerFileNotFoundException('parsed timeline file')
|
||||
|
||||
timeline_list = []
|
||||
try:
|
||||
with open(file_path, 'r') as f_obj:
|
||||
for line in f_obj:
|
||||
if not line.startswith('op_name'):
|
||||
line_list = line.strip('\n').split(',')
|
||||
timeline_list.append(line_list)
|
||||
except (IOError, OSError) as err:
|
||||
logger.error('Error occurred when read timeline intermediate file: %s', err)
|
||||
raise ProfilerIOException
|
||||
|
||||
return timeline_list
|
||||
|
||||
def _parse_timeline_data(self, timeline, min_cycle_counter):
|
||||
"""Parse timeline data."""
|
||||
# factor to convert the time unit from 1ms to 1us for timeline display
|
||||
factor = 1000
|
||||
op_meta = TimelineContainer(timeline)
|
||||
timeline_dict = {}
|
||||
timeline_dict['name'] = op_meta.op_name
|
||||
timeline_dict['ph'] = 'X'
|
||||
timeline_dict['tid'] = op_meta.stream_id
|
||||
timeline_dict['ts'] = (op_meta.start_time - min_cycle_counter) * factor
|
||||
dur = op_meta.duration * factor
|
||||
timeline_dict['dur'] = dur
|
||||
if op_meta.pid is None:
|
||||
timeline_dict['pid'] = int(self._device_id)
|
||||
# Update total time of operator execution.
|
||||
self._timeline_summary['total_time'] += dur
|
||||
else: # AllReduce and AI CPU pid
|
||||
timeline_dict['pid'] = op_meta.pid
|
||||
self._timeline_meta.append(timeline_dict)
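# One resulting timeline entry, in the Chrome trace event form built above
# (the keys follow the assignments in this method; the values are illustrative):
#     {'name': 'Conv2D-op1', 'ph': 'X', 'tid': 0, 'ts': 1250.0, 'dur': 3200.0, 'pid': 0}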
|
||||
|
||||
@staticmethod
|
||||
def _update_num_of_streams(timeline, stream_count_dict):
|
||||
"""Update number of streams."""
|
||||
stream_id = timeline[1]
|
||||
if stream_id not in stream_count_dict.keys():
|
||||
stream_count_dict[stream_id] = 1
|
||||
else:
|
||||
stream_count_dict[stream_id] += 1
|
||||
|
||||
def get_min_cycle_counter(self):
|
||||
"""
|
||||
Get minimum cycle counter.
|
||||
|
||||
Returns:
|
||||
float, the minimum value of the cycle counter.
|
||||
"""
|
||||
file_path = os.path.join(
|
||||
self._profiling_dir,
|
||||
self._min_cycle_counter_file_path.format(self._device_id)
|
||||
)
|
||||
|
||||
file_path = validate_and_normalize_path(file_path)
|
||||
|
||||
if os.path.exists(file_path):
|
||||
try:
|
||||
with open(file_path, 'r') as f_obj:
|
||||
min_cycle_counter = f_obj.read()
|
||||
min_cycle_counter = float(min_cycle_counter) \
if min_cycle_counter != 'inf' else 0
|
||||
except (IOError, OSError) as err:
|
||||
logger.error('Error occurred when read minimum cycle counter: %s', err)
|
||||
raise ProfilerIOException
|
||||
else:
|
||||
min_cycle_counter = 0
|
||||
logger.info("No min cycle counter recorded.")
|
||||
|
||||
return min_cycle_counter
|
||||
|
||||
def init_timeline(self, all_reduce_info, framework_info, aicpu_info, min_cycle_counter):
|
||||
"""
|
||||
Init timeline metadata, adding all collected info.
|
||||
|
||||
Args:
|
||||
all_reduce_info (list[list]): The metadata of AllReduce operator.
|
||||
framework_info (dict): The framework metadata.
|
||||
aicpu_info (dict): The metadata of AI CPU operator.
|
||||
min_cycle_counter (float): The minimum cycle counter of the timeline.
|
||||
"""
|
||||
if min_cycle_counter == float('inf'):
|
||||
min_cycle_counter = 0
|
||||
|
||||
logger.info('Initiating timeline...')
|
||||
timeline_list = self._load_timeline_data()
|
||||
self._timeline_summary['op_exe_times'] = len(timeline_list)
|
||||
|
||||
# Add AllReduce info to timeline temp list and sort by start time.
|
||||
if all_reduce_info:
|
||||
logger.debug('AllReduce info found. Start adding info into timeline...')
|
||||
timeline_list.extend(all_reduce_info)
|
||||
timeline_list.sort(key=lambda x: float(x[2]))
|
||||
|
||||
# Add AI CPU data into timeline temp list and sort by start time.
|
||||
aicpu_data = aicpu_info.get('info')
|
||||
if aicpu_data:
|
||||
timeline_list.extend(aicpu_data)
|
||||
timeline_list.sort(key=lambda x: float(x[2]))
|
||||
self._timeline_summary['op_exe_times'] += aicpu_info.get('op_exe_times', 0)
|
||||
self._timeline_summary['num_of_streams'] += aicpu_info.get('num_of_streams', 0)
|
||||
self._timeline_summary['num_of_ops'] += aicpu_info.get('num_of_ops', 0)
|
||||
self._timeline_summary['total_time'] += aicpu_info.get('total_time', 0)
|
||||
|
||||
# Init a dict for counting the num of streams.
|
||||
stream_count_dict = {}
|
||||
for timeline in timeline_list:
|
||||
self._parse_timeline_data(timeline, min_cycle_counter)
|
||||
# Updating the collection of streams.
|
||||
if len(timeline) == 4:
|
||||
self._update_num_of_streams(timeline, stream_count_dict)
|
||||
|
||||
# Get framework metadata.
|
||||
framework_obj_list = framework_info.get('object')
|
||||
# The length of list is the number of operators.
|
||||
self._timeline_summary['num_of_ops'] += len(framework_obj_list)
|
||||
self._add_framework_info(framework_obj_list)
|
||||
logger.info('Finished adding info into timeline...')
|
||||
|
||||
# Update timeline summary info
|
||||
self._timeline_summary['num_of_streams'] += len(stream_count_dict.keys())
|
||||
|
||||
def _add_framework_info(self, framework_obj_list):
|
||||
"""
|
||||
Add framework info into timeline metadata.
|
||||
|
||||
Args:
|
||||
framework_obj_list (list): The framework metadata.
|
||||
"""
|
||||
logger.debug('Start adding framework info into timeline...')
|
||||
# Get the framework info that will be written into timeline.
|
||||
framework_info_dict = {}
|
||||
for framework_obj in framework_obj_list:
|
||||
op_name = framework_obj[0]
|
||||
op_type = framework_obj[1]
|
||||
op_full_name = framework_obj[4]
|
||||
op_info = framework_obj[5]
|
||||
framework_info_dict[op_full_name] = {
|
||||
'name': op_name,
|
||||
'args': {
|
||||
'type': op_type,
|
||||
'fullname': op_full_name
|
||||
}
|
||||
}
|
||||
framework_info_dict[op_full_name]['args'].update(op_info)
|
||||
|
||||
# Insert framework info into timeline.
|
||||
for timeline_item in self._timeline_meta:
|
||||
op_full_name = timeline_item.get('name')
|
||||
framework_item = framework_info_dict.get(op_full_name)
|
||||
if framework_item:
|
||||
timeline_item['name'] = framework_item.get('name')
|
||||
timeline_item['args'] = framework_item.get('args')
|
||||
|
||||
logger.debug('Finished adding framework info into timeline...')
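# Illustrative end-to-end sketch for the timeline analyser (the directory and
# the empty inputs are hypothetical; real callers pass the outputs of the
# other parsers in this module):
#     >>> analyser = TimelineAnalyser('./profiler_output', '0')
#     >>> min_cycle = analyser.get_min_cycle_counter()
#     >>> analyser.init_timeline(all_reduce_info=[], framework_info={'object': []},
#     ...                        aicpu_info={'info': []}, min_cycle_counter=min_cycle)
#     >>> analyser.write_timeline()
#     >>> analyser.write_timeline_summary()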
@ -0,0 +1,88 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Minddata aicpu parser."""
|
||||
import os
|
||||
|
||||
from mindspore.profiler.common.util import get_file_join_name, fwrite_format
|
||||
from mindspore import log as logger
|
||||
|
||||
|
||||
class MinddataParser:
|
||||
"""Minddata Aicpu Parser."""
|
||||
@staticmethod
|
||||
def parse_minddata_aicpu_data(minddata_aicpu_source_path):
|
||||
"""
|
||||
Parse minddata get_next info which contains queue size and execute time.
|
||||
|
||||
Args:
|
||||
minddata_aicpu_source_path (str): the source file path.
|
||||
|
||||
Returns:
|
||||
list[Union[str, float]], the converted data.
|
||||
"""
|
||||
result = list()
|
||||
try:
|
||||
with open(minddata_aicpu_source_path) as source_data_file:
|
||||
source_data = source_data_file.read()
|
||||
step_data = source_data.split("\x00")
|
||||
for one_step in step_data:
|
||||
if one_step:
|
||||
node_info = one_step.split(", ")
|
||||
node_name, node_start, node_end, queue_size = "", 0, 0, 0
|
||||
if node_info:
|
||||
node_name = node_info[0].replace("Node:", "")
|
||||
if len(node_info) > 2:
|
||||
node_start = node_info[1].replace("Run start:", "")
|
||||
if node_start.isdigit():
|
||||
node_start = int(node_start)
|
||||
node_end = node_info[2].replace("Run end:", "")
|
||||
if node_end.isdigit():
|
||||
node_end = int(node_end)
|
||||
if len(node_info) > 3:
|
||||
queue_size = node_info[3].replace("queue size:", "")
|
||||
if queue_size.isdigit():
|
||||
queue_size = int(queue_size)
|
||||
|
||||
one_step_list = [node_name, node_start, node_end, queue_size]
|
||||
result.append(one_step_list)
|
||||
except OSError:
|
||||
logger.error("Open get_next profiling file error.")
|
||||
|
||||
return result
|
||||
|
||||
@staticmethod
|
||||
def execute(source_path, output_path, device_id):
|
||||
"""
|
||||
Execute the parser.
|
||||
|
||||
Args:
|
||||
source_path (str): the source file path.
|
||||
output_path (str): the output file path.
|
||||
device_id (str): the device id.
|
||||
"""
|
||||
col_names = ["node_name", "start_time", "end_time", "queue_size"]
|
||||
minddata_aicpu_source_path = get_file_join_name(
|
||||
input_path=source_path, file_name='DATA_PREPROCESS.dev.AICPUMI')
|
||||
if not minddata_aicpu_source_path:
|
||||
minddata_aicpu_source_path = get_file_join_name(
|
||||
input_path=os.path.join(source_path, "data"), file_name='DATA_PREPROCESS.dev.AICPUMI')
|
||||
if not minddata_aicpu_source_path:
|
||||
return
|
||||
minddata_aicpu_output_path = os.path.join(output_path, "minddata_aicpu_" + device_id + ".txt")
|
||||
|
||||
minddata_aicpu_data = MinddataParser.parse_minddata_aicpu_data(minddata_aicpu_source_path)
|
||||
if minddata_aicpu_data:
|
||||
fwrite_format(minddata_aicpu_output_path, " ".join(col_names), is_start=True)
|
||||
fwrite_format(minddata_aicpu_output_path, minddata_aicpu_data, is_start=True)
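# Illustrative usage sketch (the paths and device id are hypothetical):
#     >>> MinddataParser.execute('/var/log/npu/profiling/JOB123456789',
#     ...                        './profiler_output', '0')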
@ -0,0 +1,287 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Thr parser for parsing minddata pipeline files."""
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
from queue import Queue
|
||||
|
||||
from mindspore.profiler.common.exceptions.exceptions import \
|
||||
ProfilerPathErrorException, ProfilerFileNotFoundException, \
|
||||
ProfilerDirNotFoundException, ProfilerRawFileException
|
||||
from mindspore import log as logger
|
||||
from mindspore.profiler.common.validator.validate_path import \
|
||||
validate_and_normalize_path
|
||||
|
||||
|
||||
class MinddataPipelineParser:
|
||||
"""
|
||||
The parser for parsing minddata pipeline files.
|
||||
|
||||
Args:
|
||||
source_dir (str): The minddata pipeline source dir.
|
||||
device_id (str): The device ID.
|
||||
output_path (str): The directory of the parsed file. Default: `./`.
|
||||
|
||||
Raises:
|
||||
ProfilerPathErrorException: If the minddata pipeline file path or
|
||||
the output path is invalid.
|
||||
ProfilerFileNotFoundException: If the minddata pipeline file or
|
||||
the output dir does not exist.
|
||||
"""
|
||||
_raw_pipeline_file_name = 'pipeline_profiling_{}.json'
|
||||
_parsed_pipeline_file_name = 'minddata_pipeline_raw_{}.csv'
|
||||
_col_names = [
|
||||
'op_id', 'op_type', 'num_workers', 'output_queue_size',
|
||||
'output_queue_average_size', 'output_queue_length',
|
||||
'output_queue_usage_rate', 'sample_interval', 'parent_id', 'children_id'
|
||||
]
|
||||
|
||||
def __init__(self, source_dir, device_id, output_path='./'):
|
||||
self._device_id = device_id
|
||||
self._pipeline_path = self._get_pipeline_path(source_dir)
|
||||
self._save_path = self._get_save_path(output_path)
|
||||
|
||||
@property
|
||||
def save_path(self):
|
||||
"""
|
||||
The property of save path.
|
||||
|
||||
Returns:
|
||||
str, the save path.
|
||||
"""
|
||||
return self._save_path
|
||||
|
||||
def parse(self):
|
||||
"""
|
||||
Parse the minddata pipeline files.
|
||||
|
||||
Raises:
|
||||
ProfilerRawFileException: If fails to parse the raw file of
|
||||
minddata pipeline or the file is empty.
|
||||
"""
|
||||
with open(self._pipeline_path, 'r') as file:
|
||||
try:
|
||||
pipeline_info = json.load(file)
|
||||
except (json.JSONDecodeError, TypeError) as err:
|
||||
logger.warning(err)
|
||||
raise ProfilerRawFileException(
|
||||
'Failed to parse minddata pipeline file.'
|
||||
)
|
||||
if not pipeline_info:
|
||||
logger.warning('The minddata pipeline file is empty.')
|
||||
raise ProfilerRawFileException(
|
||||
'The minddata pipeline file is empty.'
|
||||
)
|
||||
|
||||
self._parse_and_save(pipeline_info)
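# Minimal shape of the raw pipeline JSON that parse() accepts (the field names
# follow the .get() calls in this class; the concrete values are illustrative):
#     {
#         "sampling_interval": 10,
#         "op_info": [
#             {"op_id": 0, "op_type": "Batch", "num_workers": 4, "children": [1],
#              "metrics": {"output_queue": {"size": [2, 3], "length": 4}}},
#             {"op_id": 1, "op_type": "Map", "num_workers": 8, "children": null,
#              "metrics": null}
#         ]
#     }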
|
||||
|
||||
def _get_pipeline_path(self, source_dir):
|
||||
"""
|
||||
Get the minddata pipeline file path.
|
||||
|
||||
Args:
|
||||
source_dir (str): The minddata pipeline source dir.
|
||||
|
||||
Returns:
|
||||
str, the minddata pipeline file path.
|
||||
"""
|
||||
pipeline_path = os.path.join(
|
||||
source_dir,
|
||||
self._raw_pipeline_file_name.format(self._device_id)
|
||||
)
|
||||
|
||||
try:
|
||||
pipeline_path = validate_and_normalize_path(pipeline_path)
|
||||
except RuntimeError:
|
||||
logger.warning('Minddata pipeline file is invalid.')
|
||||
raise ProfilerPathErrorException('Minddata pipeline file is invalid.')
|
||||
if not os.path.isfile(pipeline_path):
|
||||
logger.warning(
|
||||
'The minddata pipeline file <%s> not found.', pipeline_path
|
||||
)
|
||||
raise ProfilerFileNotFoundException(pipeline_path)
|
||||
|
||||
return pipeline_path
|
||||
|
||||
def _get_save_path(self, output_path):
|
||||
"""
|
||||
Get the save path.
|
||||
|
||||
Args:
|
||||
output_path (str): The output dir.
|
||||
|
||||
Returns:
|
||||
str, the save path.
|
||||
"""
|
||||
try:
|
||||
output_dir = validate_and_normalize_path(output_path)
|
||||
except RuntimeError:
|
||||
logger.warning('Output path is invalid.')
|
||||
raise ProfilerPathErrorException('Output path is invalid.')
|
||||
if not os.path.isdir(output_dir):
|
||||
logger.warning('The output dir <%s> not found.', output_dir)
|
||||
raise ProfilerDirNotFoundException(output_dir)
|
||||
return os.path.join(
|
||||
output_dir, self._parsed_pipeline_file_name.format(self._device_id)
|
||||
)
|
||||
|
||||
def _parse_and_save(self, pipeline_info):
|
||||
"""
|
||||
Parse and save the parsed minddata pipeline file.
|
||||
|
||||
Args:
|
||||
pipeline_info (dict): The pipeline info reads from the raw file of
|
||||
the minddata pipeline.
|
||||
|
||||
Raises:
|
||||
ProfilerRawFileException: If the format of minddata pipeline raw
|
||||
file is wrong.
|
||||
"""
|
||||
sample_interval = pipeline_info.get('sampling_interval')
|
||||
op_info = pipeline_info.get('op_info')
|
||||
if sample_interval is None or not op_info:
|
||||
raise ProfilerRawFileException(
|
||||
'The format of minddata pipeline raw file is wrong.'
|
||||
)
|
||||
|
||||
op_id_info_cache = {}
|
||||
for item in op_info:
|
||||
op_id_info_cache[item.get('op_id')] = item
|
||||
|
||||
with open(self._save_path, 'w') as save_file:
|
||||
csv_writer = csv.writer(save_file)
|
||||
csv_writer.writerow(self._col_names)
|
||||
self._parse_and_save_op_info(
|
||||
csv_writer, op_id_info_cache, sample_interval
|
||||
)
|
||||
|
||||
def _parse_and_save_op_info(self, csv_writer, op_id_info_cache,
|
||||
sample_interval):
|
||||
"""
|
||||
Parse and save the minddata pipeline operator information.
|
||||
|
||||
Args:
|
||||
csv_writer (csv.writer): The csv writer.
|
||||
op_id_info_cache (dict): The operator id and information cache.
|
||||
sample_interval (int): The sample interval.
|
||||
|
||||
Raises:
|
||||
ProfilerRawFileException: If the operator that id is 0 does not exist.
|
||||
"""
|
||||
queue = Queue()
|
||||
root_node = op_id_info_cache.get(0)
|
||||
if not root_node:
|
||||
raise ProfilerRawFileException(
|
||||
'The format of minddata pipeline raw file is wrong, '
|
||||
'the operator that id is 0 does not exist.'
|
||||
)
|
||||
root_node['parent_id'] = None
|
||||
queue.put_nowait(root_node)
|
||||
|
||||
while not queue.empty():
|
||||
node = queue.get_nowait()
|
||||
self._update_child_node(node, op_id_info_cache)
|
||||
csv_writer.writerow(self._get_op_info(node, sample_interval))
|
||||
|
||||
op_id = node.get('op_id')
|
||||
children_ids = node.get('children')
|
||||
if not children_ids:
|
||||
continue
|
||||
for child_op_id in children_ids:
|
||||
sub_node = op_id_info_cache.get(child_op_id)
|
||||
sub_node['parent_id'] = op_id
|
||||
queue.put_nowait(sub_node)
|
||||
|
||||
def _update_child_node(self, node, op_id_info_cache):
|
||||
"""
|
||||
Updates the child node information of the operator.
|
||||
|
||||
Args:
|
||||
node (dict): The node represents an operator.
|
||||
op_id_info_cache (dict): The operator id and information cache.
|
||||
"""
|
||||
child_op_ids = node.get('children')
|
||||
if not child_op_ids:
|
||||
return
|
||||
|
||||
queue = Queue()
|
||||
self._cp_list_item_to_queue(child_op_ids, queue)
|
||||
|
||||
new_child_op_ids = []
|
||||
while not queue.empty():
|
||||
child_op_id = queue.get_nowait()
|
||||
child_node = op_id_info_cache.get(child_op_id)
|
||||
if child_node is None:
|
||||
continue
|
||||
metrics = child_node.get('metrics')
|
||||
if not metrics or not metrics.get('output_queue'):
|
||||
op_ids = child_node.get('children')
|
||||
if op_ids:
|
||||
self._cp_list_item_to_queue(op_ids, queue)
|
||||
else:
|
||||
new_child_op_ids.append(child_op_id)
|
||||
|
||||
node['children'] = new_child_op_ids
|
||||
|
||||
def _get_op_info(self, op_node, sample_interval):
|
||||
"""
|
||||
Get the operator information.
|
||||
|
||||
Args:
|
||||
op_node (dict): The node represents an operator.
|
||||
sample_interval (int): The sample interval.
|
||||
|
||||
Returns:
|
||||
list[str, int, float], the operator information.
|
||||
"""
|
||||
queue_size = None
|
||||
queue_average_size = None
|
||||
queue_length = None
|
||||
queue_usage_rate = None
|
||||
metrics = op_node.get('metrics')
|
||||
if metrics:
|
||||
output_queue = metrics.get('output_queue')
|
||||
if output_queue:
|
||||
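# 'size' holds the sampled output queue sizes; the usage rate is the average sampled size divided by the
# queue capacity ('length'). For example, sampled sizes [2, 4, 6] with capacity 10 give a usage rate of 0.4.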
queue_size = output_queue.get('size')
|
||||
queue_average_size = sum(queue_size) / len(queue_size)
|
||||
queue_length = output_queue.get('length')
|
||||
queue_usage_rate = queue_average_size / queue_length
|
||||
|
||||
children_id = op_node.get('children')
|
||||
op_info = [
|
||||
op_node.get('op_id'),
|
||||
op_node.get('op_type'),
|
||||
op_node.get('num_workers'),
|
||||
queue_size,
|
||||
queue_average_size,
|
||||
queue_length,
|
||||
queue_usage_rate,
|
||||
sample_interval,
|
||||
op_node.get('parent_id'),
|
||||
children_id if children_id else None
|
||||
]
|
||||
return op_info
|
||||
|
||||
def _cp_list_item_to_queue(self, inner_list, queue):
|
||||
"""
|
||||
Copy the contents of a list to a queue.
|
||||
|
||||
Args:
|
||||
inner_list (list): The list.
|
||||
queue (Queue): The target queue.
|
||||
"""
|
||||
for item in inner_list:
|
||||
queue.put_nowait(item)
|
|
@ -0,0 +1,245 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Op compute time files parser."""
|
||||
import os
|
||||
from mindspore.profiler.common.util import fwrite_format
|
||||
from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
|
||||
ProfilerIOException
|
||||
from mindspore import log as logger
|
||||
from mindspore.profiler.common.validator.validate_path import validate_and_normalize_path
|
||||
from mindspore.profiler.parser.container import HWTSContainer
|
||||
|
||||
TIMELINE_FILE_COLUMN_TITLE = 'op_name, stream_id, start_time(ms), duration(ms)'
|
||||
|
||||
class OPComputeTimeParser:
|
||||
"""
|
||||
Join hwts info and framework info, get op time info, and output to the result file.
|
||||
|
||||
Args:
|
||||
hwts_output_file (str): The file path of the hwts output file, such as './output_format_data_hwts_0.txt'.
|
||||
output_filename (str): The output data file path and name. Such as: './output_op_compute_time_0.txt'.
|
||||
op_task_info (dict): The task and op relation info, in the format {task_id: [opname, stream_id, block dim]}.
|
||||
"""
|
||||
|
||||
_dst_file_title = 'title:op compute time'
|
||||
_dst_file_column_title = 'op_name compute_time(ms) stream_id'
|
||||
_dst_file_column_title += '\n------------ --------------- ---------'
|
||||
|
||||
def __init__(self, hwts_output_file, output_filename, op_task_info,
|
||||
output_path, device_id):
|
||||
hwts_output_file = validate_and_normalize_path(hwts_output_file)
|
||||
self._hwts_output_file = hwts_output_file
|
||||
self._output_filename = output_filename
|
||||
self._op_task_info = op_task_info
|
||||
self._output_path = output_path
|
||||
self._device_id = device_id
|
||||
self._min_cycle_counter = float("inf")
|
||||
|
||||
def _get_op_task_id_map(self):
|
||||
"""
|
||||
Read hwts data file, get the task time info.
|
||||
|
||||
Returns:
|
||||
list: all hwts task time info.
|
||||
"""
|
||||
|
||||
op_map_result = []
|
||||
hwts_list = []
|
||||
|
||||
if not os.path.exists(self._hwts_output_file):
|
||||
logger.error('The hwts output file does not exist.')
|
||||
raise ProfilerFileNotFoundException('hwts output file')
|
||||
|
||||
with open(self._hwts_output_file, 'r') as data_file:
|
||||
lines = data_file.readlines()
|
||||
for line in lines:
|
||||
if line.startswith("Start of task") or line.startswith("End of task"):
|
||||
line_split = line.split()
|
||||
container = HWTSContainer(line_split)
|
||||
hwts_list.append(container)
|
||||
|
||||
# hwts op map by taskId
|
||||
for hwts in hwts_list:
|
||||
if hwts.task_id in self._op_task_info.keys():
|
||||
hwts.op_name = self._op_task_info[hwts.task_id]
|
||||
op_map_result.append(hwts)
|
||||
|
||||
return op_map_result
|
||||
|
||||
def execute(self):
|
||||
"""Execute the parser, compute all op, get op time, and write it to the output file."""
|
||||
# Calculate the execution time of operators,
|
||||
# and update the minimum cycle counter.
|
||||
tmp_result_data = self._calculate_op_execution_time()
|
||||
|
||||
# Convert time units from nanoseconds to milliseconds.
|
||||
# The unit of the cycle counter is 10 nanoseconds.
|
||||
op_name_time_dict = {}
|
||||
op_name_stream_dict = {}
|
||||
op_name_count_dict = {}
|
||||
op_name_task_dict = {}
|
||||
op_name_start_time = {}
|
||||
self._convert_op_time_unit(
|
||||
tmp_result_data, op_name_time_dict, op_name_stream_dict,
|
||||
op_name_count_dict, op_name_task_dict, op_name_start_time
|
||||
)
|
||||
|
||||
result_data = ""
|
||||
total_time = 0
|
||||
for op_name, time in op_name_time_dict.items():
|
||||
if op_name in op_name_stream_dict.keys():
|
||||
stream_id = op_name_stream_dict[op_name]
|
||||
avg_time = time / op_name_count_dict[op_name]
|
||||
total_time += avg_time
|
||||
result_data += ("%s %s %s\n" %(op_name, str(avg_time), stream_id))
|
||||
result_data += ("total op %s 0" %(str(total_time)))
|
||||
|
||||
timeline_data = []
|
||||
for op_name, time in op_name_time_dict.items():
|
||||
if op_name in op_name_stream_dict.keys():
|
||||
stream_id = op_name_stream_dict[op_name]
|
||||
start_time_list = op_name_start_time.get(op_name)
|
||||
for (start_time, duration) in start_time_list:
|
||||
timeline_data.append([op_name, stream_id, start_time, duration])
|
||||
|
||||
# Write the metadata of operators into the file,
|
||||
# including operator name, average time, and stream id.
|
||||
self._write_op_time_into_file(result_data)
|
||||
# Write the timeline data into file,
|
||||
# including operator name, stream id, start time, and duration.
|
||||
self._write_timeline_data_into_file(timeline_data)
|
||||
|
||||
def _write_op_time_into_file(self, result_data):
|
||||
"""
|
||||
Write the metadata of operators into the file, including
|
||||
op name, average time, and stream id.
|
||||
|
||||
Args:
|
||||
result_data (str): The metadata to be written into the file.
|
||||
'op_name_1', 'avg_time_1', 'stream_id_1',
|
||||
'op_name_2', 'avg_time_2', 'stream_id_2',
|
||||
...
|
||||
"""
|
||||
|
||||
fwrite_format(self._output_filename, data_source=self._dst_file_title, is_start=True)
|
||||
fwrite_format(self._output_filename, data_source=self._dst_file_column_title)
|
||||
fwrite_format(self._output_filename, data_source=result_data)
|
||||
|
||||
def _write_timeline_data_into_file(self, timeline_data):
|
||||
"""
|
||||
Write the timeline information into the file, including
|
||||
operator name, stream id, start time and duration.
|
||||
|
||||
Args:
|
||||
timeline_data (list): The metadata to be written into the file.
|
||||
[
|
||||
['op_name_1', 'stream_id_1', 'start_time_1', 'duration_1'],
|
||||
['op_name_2', 'stream_id_2', 'start_time_2', 'duration_2'],
|
||||
[...]
|
||||
]
|
||||
"""
|
||||
# sorted by start times
|
||||
timeline_data.sort(key=lambda x: float(x[2]))
|
||||
filename = 'output_timeline_data_{}.txt'.format(self._device_id)
|
||||
file_path = os.path.join(self._output_path, filename)
|
||||
file_path = validate_and_normalize_path(file_path)
|
||||
|
||||
# write to file
|
||||
try:
|
||||
with open(file_path, 'w') as f_obj:
|
||||
f_obj.write(TIMELINE_FILE_COLUMN_TITLE + '\n')
|
||||
for timeline in timeline_data:
|
||||
timeline = [str(item) for item in timeline]
|
||||
f_obj.write(','.join(timeline) + '\n')
|
||||
except (IOError, OSError) as err:
|
||||
logger.error('Error occurred when writing intermediate timeline file: %s', err)
|
||||
raise ProfilerIOException
|
||||
|
||||
def _calculate_op_execution_time(self):
|
||||
"""
|
||||
Calculate the execution time of each operator.
|
||||
|
||||
Returns:
|
||||
list, including the intermediate data of op execution time.
|
||||
"""
|
||||
tmp_result_data = []
|
||||
op_map_list = self._get_op_task_id_map()
|
||||
|
||||
cur_index = 0
|
||||
length = len(op_map_list)
|
||||
min_cycle_counter = float("inf")
|
||||
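# Scan the HWTS records pairwise: a "Start" record followed by an "End" record of the same op marks one execution.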
while cur_index < length:
|
||||
if cur_index + 1 == length:
|
||||
break
|
||||
|
||||
op_start = op_map_list[cur_index]
|
||||
op_end = op_map_list[cur_index + 1]
|
||||
if op_start.status == "Start" and op_end.status == "End" \
|
||||
and op_start.op_name == op_end.op_name:
|
||||
op_start.duration = op_end.cycle_counter - op_start.cycle_counter
|
||||
tmp_result_data.append(op_start)
|
||||
cur_index += 2
|
||||
if not op_start.op_name.startswith("assign"):
|
||||
min_cycle_counter = min(min_cycle_counter, op_start.cycle_counter)
|
||||
else:
|
||||
cur_index += 1
|
||||
|
||||
# Update the value of minimum cycle counter.
|
||||
self._min_cycle_counter = min_cycle_counter / 1e5 # Convert the time unit from 10ns to 1ms
|
||||
|
||||
return tmp_result_data
|
||||
|
||||
def _convert_op_time_unit(self, op_data_list, op_name_time_dict, op_name_stream_dict,
|
||||
op_name_count_dict, op_name_task_dict, op_name_start_time):
|
||||
"""
|
||||
Calculate the execution time of each operator and convert it into milliseconds.
|
||||
|
||||
Args:
|
||||
op_data_list (list): The list of operator metadata.
|
||||
op_name_time_dict (dict): The mapping relation of operator name and its execution time.
|
||||
op_name_stream_dict (dict): The mapping relation of operator name and its stream id.
|
||||
op_name_count_dict (dict): The mapping relation of operator name and its count.
|
||||
op_name_task_dict (dict): The mapping relation of operator name and its task id.
|
||||
op_name_start_time (dict): The mapping relation of operator name and its start time.
|
||||
"""
|
||||
factor = 1e5
|
||||
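# One cycle counter tick is 10 ns, so dividing by 1e5 converts ticks to milliseconds (10 ns * 1e5 = 1 ms).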
for item in op_data_list:
|
||||
op_name = item.op_name
|
||||
# Unit conversion: converting the cycle counter into ms.
|
||||
op_start_time_str = str(item.cycle_counter / factor)
|
||||
op_duration = item.duration / factor
|
||||
op_duration_str = str(item.duration / factor)
|
||||
if op_name in op_name_time_dict.keys():
|
||||
op_name_time_dict[op_name] += op_duration
|
||||
if item.task_id == op_name_task_dict[op_name]:
|
||||
op_name_count_dict[op_name] += 1
|
||||
op_name_start_time[op_name].append(
|
||||
(op_start_time_str, op_duration_str)
|
||||
)
|
||||
|
||||
else:
|
||||
op_name_time_dict[op_name] = op_duration
|
||||
op_name_stream_dict[op_name] = item.stream_id
|
||||
op_name_task_dict[op_name] = item.task_id
|
||||
op_name_count_dict[op_name] = 1
|
||||
op_name_start_time[op_name] = []
|
||||
op_name_start_time[op_name].append(
|
||||
(op_start_time_str, op_duration_str)
|
||||
)
|
||||
|
||||
@property
|
||||
def min_cycle_counter(self):
|
||||
"""Get minimum cycle counter."""
|
||||
return self._min_cycle_counter
|
|
@ -0,0 +1,382 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""The parser for step trace data."""
|
||||
import csv
|
||||
import json
|
||||
import os
|
||||
import stat
|
||||
import struct
|
||||
from collections import namedtuple
|
||||
from decimal import Decimal
|
||||
|
||||
from mindspore.profiler.common.exceptions.exceptions import ProfilerPathErrorException, \
|
||||
JobIdMismatchException, ProfilerIOException
|
||||
from mindspore import log
|
||||
from mindspore.profiler.common.util import get_summary_for_step_trace
|
||||
|
||||
StepTraceStruct = namedtuple(
|
||||
'TrainingTraceStruct', ['tag_id', 'task_id', 'stream_id', 'sys_count']
|
||||
)
|
||||
|
||||
|
||||
class StepTraceParser:
|
||||
"""
|
||||
The parser for step trace data.
|
||||
|
||||
Args:
|
||||
input_dir (str): The directory that contains original step trace data.
|
||||
output_file_path (str): The output file path.
|
||||
job_id (int): The job id used to define the start of new step. Default: 0.
|
||||
skip_first_step (bool): Whether skip the first step or not.
|
||||
"""
|
||||
_event_size = 20
|
||||
_fp_tag = 1
|
||||
_bp_tag = 2
|
||||
_end_tag = 255
|
||||
|
||||
def __init__(self, input_dir, output_file_path, job_id=0, skip_first_step=False):
|
||||
self._input_dir = input_dir
|
||||
self._output_path = output_file_path
|
||||
self._job_id = job_id
|
||||
self._skip_first_step = skip_first_step
|
||||
self._result = []
|
||||
self._header = []
|
||||
self._step_num = 0
|
||||
self._tag_map = {}
|
||||
|
||||
@property
|
||||
def output_file(self):
|
||||
"""The property of step trace header."""
|
||||
file_name = self._output_path.rsplit('/', 2)
|
||||
return file_name[-1] if len(file_name) == 3 else ''
|
||||
|
||||
def show(self):
|
||||
"""The property of step trace info."""
|
||||
summary_info = {}
|
||||
if self._result:
|
||||
summary_info = get_summary_for_step_trace(self._result[-1], self._header)
|
||||
summary_info['total_steps'] = len(self._result) - 1
|
||||
print('\nStep trace summary info (unit: syscnt):')
|
||||
print(summary_info)
|
||||
print('\nThe step trace parse result is saved under ${summary_dir}/profiler/%s'
|
||||
% self.output_file)
|
||||
|
||||
def parse_and_save(self):
|
||||
"""Parse step trace files and save the result."""
|
||||
try:
|
||||
source_files = self._get_step_trace_files()
|
||||
self._parse(source_files)
|
||||
self._save()
|
||||
except IOError as err:
|
||||
log.warning(err)
|
||||
raise ProfilerIOException()
|
||||
else:
|
||||
log.info("Finish to save intermediate result for step trace file.")
|
||||
|
||||
def record_point_info(self, point_info, output_path):
|
||||
"""
|
||||
Record point info into json.
|
||||
|
||||
Args:
|
||||
point_info (dict): The point info about tag id and relative op name.
|
||||
output_path (str): The output path for saving point info.
|
||||
|
||||
Returns:
|
||||
dict, parsed point info.
|
||||
"""
|
||||
points = {
|
||||
'fp_start': point_info.get(self._fp_tag, ''),
|
||||
'bp_end': point_info.get(self._bp_tag, '')
|
||||
}
|
||||
try:
|
||||
with open(output_path, 'w') as json_file:
|
||||
json.dump(points, json_file)
|
||||
os.chmod(output_path, stat.S_IREAD)
|
||||
except (IOError, OSError) as err:
|
||||
log.warning('Failed to save point info. %s', err)
|
||||
raise ProfilerIOException
|
||||
return points
|
||||
|
||||
def update_tag_op_type_map(self, point_info):
|
||||
"""
|
||||
Update the map from tag id to op type.
|
||||
|
||||
Args:
|
||||
point_info (dict): The point info about tag id and relative op name.
|
||||
"""
|
||||
tag_map = {}
|
||||
for tag, op_name in point_info.items():
|
||||
op_type = self._get_op_type(tag, op_name)
|
||||
tag_map[tag] = op_type
|
||||
log.info("Get tag types for step trace analysis: %s", tag_map)
|
||||
self._tag_map = tag_map
|
||||
|
||||
def _get_op_type(self, tag, name):
|
||||
"""
|
||||
Get op type from tag and name.
|
||||
|
||||
Args:
|
||||
tag (int): The tag id.
|
||||
name (str): The op name.
|
||||
|
||||
Returns:
|
||||
str, the op type.
|
||||
"""
|
||||
tag_map = {self._fp_tag: 'fp', self._bp_tag: 'bp', self._end_tag: 'end'}
|
||||
# get solid tag type
|
||||
op_type = tag_map.get(tag, '')
|
||||
if op_type:
|
||||
return op_type
|
||||
# check if the tag is step tag.
|
||||
if tag > self._end_tag or tag == 0:
|
||||
return 'start'
|
||||
# analyze the reduce tag
|
||||
op_type = name.rsplit('/', 1)[-1].split('-')[0]
|
||||
if not op_type:
|
||||
log.warning("Unexpected op name:%s", name)
|
||||
|
||||
return op_type
|
||||
|
||||
def _get_step_trace_files(self):
|
||||
"""Get step trace files."""
|
||||
# step trace files may under $profiler_dir or $profiler_dir/data
|
||||
profiler_dir = self._input_dir
|
||||
step_trace_files = self._search_file(profiler_dir)
|
||||
if not step_trace_files:
|
||||
# try to find step trace files under $profiler_dir/data
|
||||
profiler_dir = os.path.join(profiler_dir, 'data')
|
||||
step_trace_files = self._search_file(profiler_dir)
|
||||
if not step_trace_files:
|
||||
raise ProfilerPathErrorException('Training trace file does not exist.')
|
||||
|
||||
return step_trace_files
|
||||
|
||||
@staticmethod
|
||||
def _search_file(input_dir):
|
||||
"""Search step trace file under specific input directory."""
|
||||
# validate input_dir
|
||||
if not os.path.isdir(input_dir):
|
||||
raise ProfilerPathErrorException(
|
||||
'{} does not exist or is not a dir'.format(input_dir)
|
||||
)
|
||||
# get step trace files
|
||||
files = os.listdir(input_dir)
|
||||
step_trace_files = list(
|
||||
filter(
|
||||
lambda file: file.startswith('training_trace') and not file.endswith('.done'),
|
||||
files
|
||||
)
|
||||
)
|
||||
# validate result
|
||||
if len(step_trace_files) > 1:
|
||||
# the format of file name is like
|
||||
# `training_trace.46.dev.profiler_default_tag.$id.slice_$number`
|
||||
# use the $number as the sorted key
|
||||
try:
|
||||
step_trace_files.sort(key=lambda path: int(path.rsplit('_', 1)[-1]))
|
||||
except ValueError as err:
|
||||
log.warning("Unable to parse file names: %s. %s", step_trace_files, err)
|
||||
step_trace_files = []
|
||||
|
||||
file_paths = [os.path.join(input_dir, file) for file in step_trace_files]
|
||||
log.info("Find %d step trace files.", len(file_paths))
|
||||
return file_paths
|
||||
|
||||
def _parse(self, source_files):
|
||||
"""Parse source step trace files."""
|
||||
log.info("Start to parse step trace file.")
|
||||
event_info = {}
|
||||
for source_file in source_files:
|
||||
with open(source_file, 'rb') as handler:
|
||||
content = handler.read()
|
||||
for step_trace in self._get_next_step_trace(content, event_info):
|
||||
if self._skip_first_step:
|
||||
self._skip_first_step = False
|
||||
continue
|
||||
self._record_trace_event(step_trace)
|
||||
self._record_average_info()
|
||||
log.info("Finish to parse step trace file.")
|
||||
|
||||
def _get_next_step_trace(self, content, event_info):
|
||||
"""
|
||||
Get next step trace info.
|
||||
|
||||
Args:
|
||||
content (bytes): The input step trace info.
|
||||
event_info (dict): The event info.
|
||||
|
||||
Returns:
|
||||
Generator, return the step trace one by one.
|
||||
"""
|
||||
for pos in range(0, len(content), self._event_size):
|
||||
next_event = self._get_trace_struct(content[pos:pos + self._event_size])
|
||||
self._construct_event_info(next_event, event_info)
|
||||
if event_info.get('end'):
|
||||
yield event_info
|
||||
|
||||
def _get_trace_struct(self, bin_info):
|
||||
"""Translate event info to StepTraceStruct."""
|
||||
if len(bin_info) == self._event_size:
|
||||
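# Each 20-byte event is tag_id (uint64), task_id (uint16), stream_id (uint16) and sys_count (uint64).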
parsed_info = struct.unpack('=QHHQ', bin_info)
|
||||
return StepTraceStruct(*parsed_info)
|
||||
return None
|
||||
|
||||
def _construct_event_info(self, next_event, event_info):
|
||||
"""Construct event info according to next_event."""
|
||||
min_job_id = 255
|
||||
step_flag: bool = lambda tag: tag > min_job_id or tag == 0
|
||||
end_flag: bool = lambda tag: tag == min_job_id
|
||||
fp_flag: bool = lambda tag: tag == self._fp_tag
|
||||
bp_flag: bool = lambda tag: tag == self._bp_tag
|
||||
|
||||
def _on_step_event():
|
||||
"""Handle step event."""
|
||||
self._validate_tag_id(tag_id)
|
||||
start_time = event_info.get('end', '-')
|
||||
event_info.clear()
|
||||
event_info['start'] = start_time
|
||||
event_info['reduce'] = {}
|
||||
|
||||
def _on_reduce_event(reduce_tag_id):
|
||||
"""Handle reduce event."""
|
||||
stream_id = next_event.stream_id
|
||||
if event_info['reduce'].get(stream_id):
|
||||
event_info['reduce'][stream_id].append((reduce_tag_id, sys_count))
|
||||
else:
|
||||
event_info['reduce'][stream_id] = [(reduce_tag_id, sys_count)]
|
||||
|
||||
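# Dispatch by tag: end-of-step first, then new-step, FP and BP; any other tag is treated as a reduce event.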
tag_id = next_event.tag_id
|
||||
sys_count = next_event.sys_count
|
||||
if end_flag(tag_id):
|
||||
event_info['end'] = sys_count
|
||||
elif step_flag(tag_id):
|
||||
_on_step_event()
|
||||
elif fp_flag(tag_id):
|
||||
event_info['fp'] = sys_count
|
||||
elif bp_flag(tag_id):
|
||||
event_info['bp'] = sys_count
|
||||
else:
|
||||
_on_reduce_event(tag_id)
|
||||
|
||||
def _validate_tag_id(self, job_id):
|
||||
"""Check the job id in source step trace file is same as user set."""
|
||||
if not self._job_id:
|
||||
self._job_id = job_id
|
||||
elif self._job_id != job_id:
|
||||
raise JobIdMismatchException()
|
||||
|
||||
def _record_trace_event(self, step_trace):
|
||||
"""Record trace event."""
|
||||
self._step_num += 1
|
||||
start_time = step_trace.get('start')
|
||||
end_time = step_trace.get('end')
|
||||
fp_time = step_trace.get('fp')
|
||||
bp_time = step_trace.get('bp')
|
||||
if not (start_time and end_time and fp_time and bp_time):
|
||||
log.warning("The step %d lacks basic time.", self._step_num)
|
||||
return
|
||||
if start_time == '-':
|
||||
start_time = fp_time
|
||||
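# Decompose the step into iteration_interval (start to fp), fp_and_bp (fp to bp) and tail (bp to end), all in syscnt.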
row_data = {
|
||||
'step_num': self._step_num,
|
||||
'start_point': start_time,
|
||||
'end_point': end_time,
|
||||
'total': end_time - start_time,
|
||||
'fp_point': fp_time,
|
||||
'bp_point': bp_time,
|
||||
'iteration_interval': fp_time - start_time,
|
||||
'fp_and_bp': bp_time - fp_time,
|
||||
'tail': end_time - bp_time
|
||||
}
|
||||
# update reduce info
|
||||
self._update_reduce_info(step_trace, row_data)
|
||||
# save the row data
|
||||
if not self._header:
|
||||
self._header = list(row_data.keys())
|
||||
row_data_list = [row_data.get(header_name, 0) for header_name in self._header]
|
||||
self._result.append(row_data_list)
|
||||
|
||||
def _update_reduce_info(self, step_trace, row_data):
|
||||
"""Extract reduce info."""
|
||||
reduce_time = step_trace.get('reduce', {})
|
||||
for stream_id, time_points in reduce_time.items():
|
||||
time_point_num = len(time_points)
|
||||
if time_point_num % 2:
|
||||
log.warning("Stream %d has %d reduce time points.", stream_id, time_point_num)
|
||||
continue
|
||||
for index, point_id in enumerate(range(0, time_point_num, 2)):
|
||||
field_name = f'stream_{stream_id}_{index}'
|
||||
reduce_info = self._get_single_reduce_event_info(
|
||||
field_name, time_points[point_id], time_points[point_id + 1])
|
||||
row_data.update(reduce_info)
|
||||
|
||||
def _get_single_reduce_event_info(self, field_name, start_point, end_point):
|
||||
"""
|
||||
Get single reduce info.
|
||||
|
||||
Args:
|
||||
field_name (str): The field name.
|
||||
start_point (Tuple[int, int]): Start point time info, including (tag_id, sys_count).
|
||||
end_point (Tuple[int, int]): End point time info, including (tag_id, sys_count).
|
||||
|
||||
Returns:
|
||||
dict, reduce info.
|
||||
"""
|
||||
reduce_info = {}
|
||||
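# Reduce points come in tag pairs: the end tag must be even and directly follow the start tag.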
if end_point[0] - start_point[0] != 1 or end_point[0] % 2:
|
||||
log.warning("Unmatched reduce event <%s, %s>.", start_point, end_point)
|
||||
return reduce_info
|
||||
op_type = self._tag_map.get(start_point[0])
|
||||
# append field name with op type.
|
||||
if not op_type:
|
||||
log.warning("Can't recognize the inner type for point tag: %d.", start_point[0])
|
||||
field_name += '_parallel'
|
||||
else:
|
||||
field_name += '_' + op_type
|
||||
reduce_info[field_name] = end_point[1] - start_point[1]
|
||||
reduce_info[field_name + '_start_point'] = start_point[1]
|
||||
reduce_info[field_name + '_end_point'] = end_point[1]
|
||||
|
||||
return reduce_info
|
||||
|
||||
def _record_average_info(self):
|
||||
"""Calculate average info."""
|
||||
result_size = len(self._result)
|
||||
# calculate average data for each column in result data
|
||||
average_data = [0] * len(self._header)
|
||||
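# The first recorded step is excluded from the average.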
if result_size >= 2:
|
||||
for row_info in self._result[1:]:
|
||||
average_data = [
|
||||
Decimal(i) + Decimal(j) for i, j in zip(row_info, average_data)
|
||||
]
|
||||
average_data = [
|
||||
round((item / (result_size - 1))) for item in average_data
|
||||
]
|
||||
# change step num info in average_data to None
|
||||
step_num_index = self._header.index('step_num')
|
||||
average_data[step_num_index] = '-'
|
||||
self._result.append(average_data)
|
||||
log.info("Finish add average info for step trace.")
|
||||
|
||||
def _save(self):
|
||||
log.info("Start to save step trace file.")
|
||||
if not self._header:
|
||||
return
|
||||
with open(self._output_path, 'w') as file_handle:
|
||||
csv_writer = csv.writer(file_handle)
|
||||
csv_writer.writerow(self._header)
|
||||
for row_data in self._result:
|
||||
csv_writer.writerow(row_data)
|
||||
os.chmod(self._output_path, stat.S_IREAD)
|
|
@ -0,0 +1,417 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Profiling api file."""
|
||||
import os
|
||||
import time
|
||||
|
||||
from mindspore import log as logger, context
|
||||
from mindspore.communication.management import release
|
||||
from mindspore.profiler.common.exceptions.exceptions import ProfilerFileNotFoundException, \
|
||||
ProfilerIOException, ProfilerException
|
||||
from mindspore.profiler.common.util import get_file_names, fwrite_format
|
||||
from mindspore.profiler.common.validator.checkparam import \
|
||||
check_bool, check_subgraph
|
||||
from mindspore.profiler.common.validator.validate_path import \
|
||||
validate_and_normalize_path
|
||||
from mindspore.profiler.parser.aicpu_data_parser import DataPreProcessParser
|
||||
from mindspore.profiler.parser.framework_parser import FrameworkParser
|
||||
from mindspore.profiler.parser.hwts_log_parser import HWTSLogParser
|
||||
from mindspore.profiler.parser.integrator import Integrator
|
||||
from mindspore.profiler.parser.integrator import TimelineAnalyser
|
||||
from mindspore.profiler.parser.minddata_parser import MinddataParser
|
||||
from mindspore.profiler.parser.minddata_pipeline_parser import \
|
||||
MinddataPipelineParser
|
||||
from mindspore.profiler.parser.optime_parser import OPComputeTimeParser
|
||||
from mindspore.profiler.parser.step_trace_parser import StepTraceParser
|
||||
|
||||
PROFILING_LOG_BASE_PATH = "/var/log/npu/profiling"
|
||||
INIT_OP_NAME = 'Default/InitDataSetQueue'
|
||||
|
||||
|
||||
class Profiler:
|
||||
"""
|
||||
Performance profiling API.
|
||||
|
||||
Enable MindSpore users to profile the performance of neural networks.
|
||||
|
||||
Args:
|
||||
subgraph (str): Define which subgraph to monitor and analyse, can be 'all', 'Default', 'Gradients'.
|
||||
is_detail (bool): Whether to show profiling data at the op_instance level; only the optype level is shown if False.
|
||||
is_show_op_path (bool): Whether to save the full path for each op instance.
|
||||
output_path (str): Output data path.
|
||||
optypes_to_deal (str): Op type names, the data of which optype should be collected and analysed,
|
||||
will deal with all op types if empty; different op types should be separated by commas.
|
||||
optypes_not_deal (str): Op type names, the data of which optype will not be collected and analysed;
|
||||
Different op types should be separated by commas.
|
||||
|
||||
Examples:
|
||||
>>> from mindspore.profiler import Profiler
|
||||
>>> import mindspore.context
|
||||
>>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
|
||||
>>> device_id=int(os.environ["DEVICE_ID"]))
|
||||
>>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
|
||||
>>> model = Model()
|
||||
>>> model.train()
|
||||
>>> profiler.analyse()
|
||||
"""
|
||||
|
||||
_base_profiling_container_path = "/var/log/npu/profiling/container"
|
||||
_hwts_output_filename_target = "output_format_data_hwts_"
|
||||
_opcompute_output_filename_target = "output_op_compute_time_"
|
||||
_aicpu_op_output_filename_target = "output_data_preprocess_aicpu_"
|
||||
|
||||
def __init__(self, subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data',
|
||||
optypes_to_deal='', optypes_not_deal='Variable', job_id=""):
|
||||
# get device_id and device_target
|
||||
self._get_devid_and_devtarget()
|
||||
self._container_path = os.path.join(self._base_profiling_container_path, self._dev_id)
|
||||
data_path = os.path.join(self._container_path, "data")
|
||||
if not os.path.exists(data_path):
|
||||
os.makedirs(data_path, exist_ok=True)
|
||||
self._output_path = validate_and_normalize_path(output_path)
|
||||
self._output_path = os.path.join(self._output_path, "profiler")
|
||||
if not os.path.exists(self._output_path):
|
||||
os.makedirs(self._output_path, exist_ok=True)
|
||||
|
||||
os.environ['PROFILING_MODE'] = 'true'
|
||||
os.environ['PROFILING_OPTIONS'] = 'training_trace:task_trace'
|
||||
os.environ['MINDDATA_PROFILING_DIR'] = self._output_path
|
||||
os.environ['DEVICE_ID'] = self._dev_id
|
||||
os.environ['AICPU_PROFILING_MODE'] = 'true'
|
||||
os.environ['PROFILING_DIR'] = str(self._container_path)
|
||||
|
||||
# Use the context interface to enable profiling, for the new MindSpore version (after 2020.5.21).
|
||||
context.set_context(enable_profiling=True, profiling_options="training_trace:task_trace")
|
||||
|
||||
self._subgraph = check_subgraph(subgraph)
|
||||
self._valid_optype_name = optypes_to_deal.split(",") if optypes_to_deal else []
|
||||
self._filt_optype_names = optypes_not_deal.split(",") if optypes_not_deal else []
|
||||
self._detail = check_bool(is_detail, 'is_detail')
|
||||
self._withfullpath = check_bool(is_show_op_path, 'is_show_op_path')
|
||||
self._profiling_job_id = job_id
|
||||
# add job id env through user input later
|
||||
self._job_id_env = 0
|
||||
self._start_time = int(time.time() * 10000000)
|
||||
logger.info("Profiling: profiling start time: %d", self._start_time)
|
||||
|
||||
def analyse(self):
|
||||
"""
|
||||
Collect and analyse performance data, called after training or during training.
|
||||
|
||||
Examples:
|
||||
>>> from mindspore.profiler import Profiler
|
||||
>>> import mindspore.context
|
||||
>>> context.set_context(mode=context.GRAPH_MODE, device_target="Ascend",
|
||||
>>> device_id=int(os.environ["DEVICE_ID"]))
|
||||
>>> profiler = Profiler(subgraph='all', is_detail=True, is_show_op_path=False, output_path='./data')
|
||||
>>> model = Model()
|
||||
>>> model.train()
|
||||
>>> profiler.analyse()
|
||||
"""
|
||||
release()
|
||||
|
||||
job_id = self._get_profiling_job_id()
|
||||
logger.info("Profiling: job id is %s ", job_id)
|
||||
|
||||
source_path = os.path.join(PROFILING_LOG_BASE_PATH, job_id)
|
||||
# parse hwts.log.data.45.dev file, and get task profiling data
|
||||
hwts_output_filename = self._hwts_output_filename_target + self._dev_id + ".txt"
|
||||
hwts_output_filename = os.path.join(self._output_path, hwts_output_filename)
|
||||
hwtslog_parser = HWTSLogParser(source_path, hwts_output_filename)
|
||||
result = hwtslog_parser.execute()
|
||||
if not result:
|
||||
logger.error("Profiling: fail to parse hwts log file.")
|
||||
return
|
||||
|
||||
# parse Framework file, and get the relation of op and tasks
|
||||
framework_parser = FrameworkParser(job_id, self._dev_id, self._output_path)
|
||||
framework_parser.parse()
|
||||
op_task_dict = framework_parser.to_task_id_full_op_name_dict()
|
||||
if not op_task_dict:
|
||||
logger.error("Profiling: fail to parse framework files.")
|
||||
return
|
||||
|
||||
# get op compute time from hwts data and framework data, write output_op_compute_time.txt
|
||||
opcompute_output_filename = self._opcompute_output_filename_target + self._dev_id + ".txt"
|
||||
opcompute_output_filename = os.path.join(self._output_path, opcompute_output_filename)
|
||||
optime_parser = OPComputeTimeParser(
|
||||
hwts_output_filename, opcompute_output_filename,
|
||||
op_task_dict, self._output_path, self._dev_id
|
||||
)
|
||||
optime_parser.execute()
|
||||
|
||||
# parse DATA_PREPROCESS.dev.AICPU file, write output_data_preprocess_aicpu_x.txt
|
||||
output_data_preprocess_aicpu = self._aicpu_op_output_filename_target + self._dev_id + ".txt"
|
||||
output_data_preprocess_aicpu = os.path.join(self._output_path, output_data_preprocess_aicpu)
|
||||
aicpu_data_parser = DataPreProcessParser(source_path, output_data_preprocess_aicpu)
|
||||
aicpu_data_parser.execute()
|
||||
|
||||
# Parsing minddata AICPU profiling
|
||||
MinddataParser.execute(source_path, self._output_path, self._dev_id)
|
||||
|
||||
# parse minddata pipeline operator and queue
|
||||
try:
|
||||
pipeline_parser = MinddataPipelineParser(self._output_path, self._dev_id, self._output_path)
|
||||
pipeline_parser.parse()
|
||||
except ProfilerException as err:
|
||||
logger.warning(err.message)
|
||||
|
||||
# analyse op compute time info
|
||||
try:
|
||||
self._analyser_op_info()
|
||||
except ProfilerException as err:
|
||||
logger.warning(err.message)
|
||||
|
||||
# analyse step trace info
|
||||
try:
|
||||
self._analyse_step_trace(source_path, framework_parser)
|
||||
except ProfilerException as err:
|
||||
logger.warning(err.message)
|
||||
|
||||
# analyse timeline info
|
||||
try:
|
||||
self._analyse_timeline(aicpu_data_parser, optime_parser)
|
||||
except (ProfilerIOException, ProfilerFileNotFoundException, RuntimeError) as err:
|
||||
logger.warning('Fail to write timeline data: %s', err)
|
||||
|
||||
def _analyse_step_trace(self, source_path, framework_parser):
|
||||
"""
|
||||
Analyse step trace data and save the result.
|
||||
|
||||
Args:
|
||||
source_path (str): The directory that contains the step trace original data.
|
||||
framework_parser (FrameworkParser): The framework parse instance.
|
||||
"""
|
||||
logger.info("Begin to parse step trace.")
|
||||
# construct output path
|
||||
step_trace_intermediate_file_path = os.path.join(
|
||||
self._output_path,
|
||||
f'step_trace_raw_{self._dev_id}_detail_time.csv'
|
||||
)
|
||||
point_info_file_path = os.path.join(
|
||||
self._output_path,
|
||||
'step_trace_point_info.json'
|
||||
)
|
||||
# whether keep the first step
|
||||
skip_first_step_flag = framework_parser.check_op_name(INIT_OP_NAME)
|
||||
point_info = framework_parser.point_info
|
||||
# parser the step trace files and save the result to disk
|
||||
parser = StepTraceParser(input_dir=source_path,
|
||||
output_file_path=step_trace_intermediate_file_path,
|
||||
job_id=self._job_id_env,
|
||||
skip_first_step=skip_first_step_flag)
|
||||
parser.update_tag_op_type_map(point_info)
|
||||
parser.parse_and_save()
|
||||
point_info = parser.record_point_info(point_info, point_info_file_path)
|
||||
# print parser result
|
||||
parser.show()
|
||||
logger.info("Finish saving the intermediate result: %s", step_trace_intermediate_file_path)
|
||||
logger.info("The point info is: %s", point_info)
|
||||
|
||||
def _analyse_timeline(self, aicpu_parser, optime_parser):
|
||||
"""
|
||||
Analyse and parse timeline info.
|
||||
|
||||
Args:
|
||||
aicpu_parser (DataPreProcessParser): The parser instance for AI CPU operator
|
||||
execution time calculation.
|
||||
optime_parser (OPComputeTimeParser): The parser instance for AI Core
|
||||
operator execution time calculation.
|
||||
"""
|
||||
timeline_analyser = TimelineAnalyser(self._output_path, self._dev_id)
|
||||
# Get framework info
|
||||
integrator = Integrator(self._output_path, self._dev_id)
|
||||
aicore_detail_data = integrator.get_aicore_detail_data()
|
||||
aicore_detail_data_size = len(aicore_detail_data)
|
||||
col_names = ['op_name', 'op_type', 'avg_execution_time', 'subgraph',
|
||||
'full_op_name', 'op_info']
|
||||
framework_info = {
|
||||
'col_name': col_names,
|
||||
'object': aicore_detail_data,
|
||||
'size': aicore_detail_data_size
|
||||
}
|
||||
|
||||
all_reduce_info = integrator.query_for_all_reduce()
|
||||
|
||||
# Get timeline info
|
||||
logger.info('Start writing timeline info...')
|
||||
logger.info('Note: It could take a few minutes if you are training '
|
||||
'with a complex network or more than 10 steps.')
|
||||
# Add info into timeline, such as AI CPU, AllReduce, framework info.
|
||||
aicpu_info = aicpu_parser.query_aicpu_data()
|
||||
min_cycle_counter = min(aicpu_parser.min_cycle_counter, optime_parser.min_cycle_counter)
|
||||
timeline_analyser.init_timeline(all_reduce_info, framework_info, aicpu_info, min_cycle_counter)
|
||||
timeline_analyser.write_timeline()
|
||||
timeline_analyser.write_timeline_summary()
|
||||
|
||||
def __del__(self):
|
||||
"""Disable the profiling collection service, called after training."""
|
||||
os.environ['PROFILING_MODE'] = str("false")
|
||||
context.set_context(enable_profiling=False)
|
||||
|
||||
def _get_profiling_job_id(self):
|
||||
"""Get profiling job id, which was generated by ada service.
|
||||
|
||||
Returns:
|
||||
str, the profiling job id.
|
||||
"""
|
||||
|
||||
if self._profiling_job_id:
|
||||
return self._profiling_job_id
|
||||
|
||||
job_id = ""
|
||||
cmd = "ls -t " + PROFILING_LOG_BASE_PATH + "|grep JOB|awk '{print $1}'"
|
||||
r = os.popen(cmd)
|
||||
profiling_job_dirs = r.readlines()
|
||||
r.close()
|
||||
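# Job dirs are listed newest first; take the first one whose device id matches and whose start time
# is not earlier than this training run.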
for item in profiling_job_dirs:
|
||||
path = os.path.join(PROFILING_LOG_BASE_PATH, item.strip())
|
||||
log_file = get_file_names(path, "host_start.log")
|
||||
if not log_file:
|
||||
logger.error("Profiling: job path %s, host_start.log not exist.", path)
|
||||
continue
|
||||
|
||||
log_file = os.path.join(path, log_file[0])
|
||||
item_dict = self._parse_host_start_log(log_file)
|
||||
|
||||
if not item_dict:
|
||||
logger.error("Profiling: job path %s, fail to get job start info.", path)
|
||||
continue
|
||||
if self._start_time > int(item_dict["start_time"]):
|
||||
logger.info("Profiling: job path %s, start_time %s, training start_time %d.",
|
||||
path, item_dict["start_time"], self._start_time)
|
||||
break
|
||||
|
||||
if self._dev_id != item_dict["device_id"]:
|
||||
logger.info("Profiling: job path %s, dev id %s, training device id %s.",
|
||||
path, item_dict["device_id"], self._dev_id)
|
||||
continue
|
||||
|
||||
job_id = item.strip()
|
||||
break
|
||||
|
||||
if not job_id:
|
||||
msg = "Fail to get profiling job, please check whether job dir was generated"
|
||||
raise RuntimeError(msg)
|
||||
|
||||
return job_id
|
||||
|
||||
def _parse_host_start_log(self, input_file):
|
||||
"""
|
||||
Parse host start log file, get the device id and start time of the job.
|
||||
|
||||
Args:
|
||||
input_file (str): The file path of the host start log file.
|
||||
|
||||
Returns:
|
||||
dict, job start time and device id.
|
||||
"""
|
||||
|
||||
item_dict = {}
|
||||
for line in open(input_file):
|
||||
if "Device" in line:
|
||||
item_dict["device_id"] = line[7:len(line)-2]
|
||||
elif "clock_realtime" in line:
|
||||
item_dict["start_time"] = line[16:len(line)-3]
|
||||
|
||||
return item_dict
|
||||
|
||||
def _analyser_op_info(self):
|
||||
"""Analyse the operator information."""
|
||||
integrator = Integrator(self._output_path, self._dev_id)
|
||||
integrator.integrate()
|
||||
|
||||
aicore_type_result = self._query_op_type_info()
|
||||
detail_file_path = os.path.join(
|
||||
self._output_path,
|
||||
'output_op_compute_time_detail_{}.txt'.format(self._dev_id)
|
||||
)
|
||||
fwrite_format(detail_file_path, data_source='title:op compute time')
|
||||
display_names = [
|
||||
'optype_name', 'compute_time(ms, per-step)',
|
||||
'called_times(per-step)', 'percent'
|
||||
]
|
||||
fwrite_format(detail_file_path, data_source=" ".join(display_names), is_print=True)
|
||||
fwrite_format(detail_file_path, data_source=aicore_type_result, is_print=True)
|
||||
|
||||
if self._detail:
|
||||
op_type_order = [item[0] for item in aicore_type_result]
|
||||
aicore_detail_result = self._query_op_detail_info(op_type_order)
|
||||
|
||||
fwrite_format(detail_file_path, data_source='', is_print=True)
|
||||
fwrite_format(detail_file_path, data_source='Detail:', is_print=True)
|
||||
col_names = ['op_name', 'op_type', 'avg_execution_time', 'subgraph',
|
||||
'full_op_name', 'op_info']
|
||||
fwrite_format(detail_file_path, data_source=" ".join(col_names), is_print=True)
|
||||
fwrite_format(detail_file_path, data_source=aicore_detail_result, is_print=True)
|
||||
|
||||
def _query_op_type_info(self):
|
||||
"""
|
||||
Query AICORE operator type information.
|
||||
|
||||
Returns:
|
||||
list[list], the AICORE operator type and execution time information.
|
||||
"""
|
||||
integrator = Integrator(self._output_path, self._dev_id)
|
||||
return integrator.get_aicore_data()
|
||||
|
||||
def _query_op_detail_info(self, op_type_order):
|
||||
"""
|
||||
Query AICORE operator detail information.
|
||||
|
||||
Args:
|
||||
op_type_order(list): The name of the op type in order.
|
||||
|
||||
Returns:
|
||||
dict, the AICORE operator detail information.
|
||||
"""
|
||||
|
||||
op_type_condition = {}
|
||||
if self._valid_optype_name:
|
||||
op_type_condition['in'] = self._valid_optype_name
|
||||
if self._filt_optype_names:
|
||||
op_type_condition['not_in'] = self._filt_optype_names
|
||||
|
||||
subgraph_condition = {}
|
||||
if self._subgraph != 'all':
|
||||
subgraph_condition['in'] = [self._subgraph]
|
||||
|
||||
integrator = Integrator(self._output_path, self._dev_id)
|
||||
return integrator.get_aicore_detail_data()
|
||||
|
||||
|
||||
def _get_devid_and_devtarget(self):
|
||||
"""Get device id and target of this training."""
|
||||
|
||||
device_target = ""
|
||||
dev_id = ""
|
||||
try:
|
||||
dev_id = str(context.get_context("device_id"))
|
||||
device_target = context.get_context("device_target")
|
||||
except ValueError as err:
|
||||
logger.error("Profiling: fail to get context, %s", err)
|
||||
|
||||
if not dev_id or not dev_id.isdigit():
|
||||
dev_id = os.getenv('DEVICE_ID')
|
||||
if not dev_id or not dev_id.isdigit():
|
||||
dev_id = "0"
|
||||
logger.error("Fail to get DEVICE_ID, use 0 instead.")
|
||||
|
||||
if device_target and device_target != "Davinci" \
|
||||
and device_target != "Ascend":
|
||||
msg = "Profiling: unsupport backend: %s" % device_target
|
||||
raise RuntimeError(msg)
|
||||
|
||||
self._dev_id = dev_id
|