!22284 Change Op name in hccl to Op name in step trace

Merge pull request !22284 from 张毅辉/op_name_of_hccl_to_op_name_of_step_trace
This commit is contained in:
i-robot 2021-08-25 09:32:55 +00:00 committed by Gitee
commit acee9b24bc
6 changed files with 137 additions and 10 deletions

View File

@ -72,7 +72,8 @@ class HcclParser:
self._dev_id = device_id
self._source_dir = source_dir
self._save_path = self._get_save_path(output_path)
self._step_timestamps_info = self._get_step_timestamps_info(output_path)
self._step_trace_info = self._get_step_trace_info(output_path)
self._communication_operator_name_mapping_info = self._get_communication_operator_name_mapping_info()
def parse(self):
"""Parse communication info."""
@ -138,8 +139,8 @@ class HcclParser:
output_path, self._parsed_hccl_file_name.format(self._dev_id)
)
def _get_step_timestamps_info(self, source_dir):
"""Get the start and end timestamps in a step."""
def _get_step_trace_info(self, source_dir):
"""Get the start and end timestamps in a step and communication operators names."""
file_path = os.path.join(
source_dir,
f'step_trace_raw_{self._dev_id}_detail_time.csv'
@ -155,23 +156,61 @@ class HcclParser:
with open(file_path, 'r') as src_file:
csv_reader = csv.reader(src_file)
# The first row of step trace file is like: step_num, start_point,...,communication_operator_name.
# The position number of the first communication operator name is 9.
communication_operators_names = next(csv_reader)[9:]
# index_0:step_num, index_1:start_point, index_2:end_point
# The unit of time stamp is 10ns. To convert it to μs, you need to divide it by 100.
step_timestamps_info = [[info[0], float(info[1]) / 100, float(info[2]) / 100]
for info in csv_reader if info[0].isdigit()]
return step_timestamps_info
return [communication_operators_names, step_timestamps_info]
def _get_communication_operator_name_mapping_info(self):
    """
    Get the name of communication operators mapping between hccl and step trace.

    Operators are grouped by type on both sides and the groups are matched
    case-insensitively. Inside a matched group the names are paired by position:
    hccl directory names sorted by their ordered number against step trace names
    in the order they appear in the step trace header. Extra names on either side
    are dropped by the pairing.

    Names that do not follow the expected pattern are skipped with a warning
    instead of aborting the whole parse.

    Returns:
        dict, the key is the operator type as spelled in hccl, the value is a list of
        (operator name in hccl, operator name in step trace) tuples.
    """
    dir_path = self._validate_dir_path(self._source_dir)

    # The name of the operator in hccl is like: operatorName_{Ordered_number}_xx_xx.
    # Group the directories by operator type in a single pass, remembering the
    # ordered number so each group can be sorted afterwards.
    hccl_ops_by_type = {}
    # Use scandir as a context manager so the directory handle is released promptly.
    with os.scandir(dir_path) as entries:
        for entry in entries:
            if not entry.is_dir():
                continue
            name_parts = entry.name.split('_')
            try:
                order = int(name_parts[1])
            except (IndexError, ValueError):
                logger.warning("Skip directory %s: it does not look like a hccl operator directory.",
                               entry.name)
                continue
            hccl_ops_by_type.setdefault(name_parts[0], []).append((order, entry.name))
    # sorted() is stable, so directories sharing an ordered number keep scan order.
    op_names_in_hccl_dic = {
        op_type: [name for _, name in sorted(ops, key=lambda op: op[0])]
        for op_type, ops in hccl_ops_by_type.items()
    }

    # The op_info in step trace is like:[op_name,op_name_start_point,op_name_end_point]
    # The name of the operator in step trace can be obtained every three.
    # The name of the operator in step trace is like: stream_xx_xx_operatorName-opxx.
    op_names_in_step_trace_dic = {}
    for step_trace_name in self._step_trace_info[0][::3]:
        name_parts = step_trace_name.split('_')
        if len(name_parts) < 4:
            logger.warning("Skip step trace column %s: it does not look like a communication operator name.",
                           step_trace_name)
            continue
        op_type = name_parts[3].split('-')[0]
        op_names_in_step_trace_dic.setdefault(op_type, []).append(step_trace_name)

    # Index the step trace groups by lower-cased type so each hccl type is
    # resolved with one lookup instead of a scan over every step trace type.
    step_trace_names_by_lower_type = {
        op_type.lower(): names for op_type, names in op_names_in_step_trace_dic.items()
    }

    communication_operator_mapping_info = {}
    for hccl_key, hccl_value in op_names_in_hccl_dic.items():
        step_trace_value = step_trace_names_by_lower_type.get(hccl_key.lower())
        if step_trace_value is not None:
            communication_operator_mapping_info[hccl_key] = list(zip(hccl_value, step_trace_value))

    logger.info("Communication operator name mapping info is %s", communication_operator_mapping_info)
    return communication_operator_mapping_info
def _calculate_the_step_by_timestamp(self, timestamp):
    """
    Calculate the step according to the timestamp.

    Args:
        timestamp (float): The timestamp to locate, in the same unit as the
            start and end points stored in the step trace info.

    Returns:
        str, the step number. "1" if the timestamp is earlier than the first
        step, the last step number if it is later than the last step, otherwise
        the step whose [start_point, end_point) interval contains it. A timestamp
        that lies in no interval (exactly on the last end point, or in a gap
        between two steps) is attributed to the latest step that has already started.

    Raises:
        IndexError: If the step trace info contains no step.
    """
    # index0:communication_operator_name, index1:step_timestamps_info
    step_timestamps_info = self._step_trace_info[1]
    # index_0:step_num, index_1:start_point, index_2:end_point
    first_step = step_timestamps_info[0]
    last_step = step_timestamps_info[-1]

    if timestamp < first_step[1]:
        return "1"
    if last_step[2] < timestamp:
        return last_step[0]

    containing_step = None
    # From here on first_step[1] <= timestamp holds, so the loop always
    # overwrites this initial value at least once.
    latest_started_step = first_step[0]
    for step_num, start_point, end_point in step_timestamps_info:
        if start_point <= timestamp:
            latest_started_step = step_num
            if timestamp < end_point:
                containing_step = step_num

    # Without this fallback a timestamp outside every half-open interval left
    # the result unbound and the method raised UnboundLocalError.
    if containing_step is None:
        return latest_started_step
    return containing_step
@ -185,6 +224,14 @@ class HcclParser:
for operator_dir in operator_dir_path:
operator_cost = self._calculate_communication_operator_cost(operator_dir)
operator_name = os.path.basename(operator_dir)
op_mapping_info = self._communication_operator_name_mapping_info.get(operator_name.split('_')[0], [])
# index1: operator name in step trace.
op_mapping_name = [item[1] for item in op_mapping_info if item[0] == operator_name]
if not op_mapping_name:
logger.warning("The mapping relationship between op name in hccl and op name in step trace "
"cannot be found. Use op name in hccl to show the name of the communication operator.")
else:
operator_name = op_mapping_name[0]
operators_cost_info[operator_name] = operator_cost
return operators_cost_info

File diff suppressed because one or more lines are too long

File diff suppressed because one or more lines are too long

View File

@ -0,0 +1,4 @@
step_num,communication_cost,wait_cost,link_info,communication_operator_cost
1,4.51637,4e-05,"{""1-0"": {""SDMA"": [4.36566, 94356.992, 21613454.09399724]}, ""0-0"": {""SDMA"": [0.15071, 47178.496, 313041576.53772146]}}","{""stream_24_0_AllReduce-op3143"": [""1"", 4.51637, 4e-05, {""1-0"": {""SDMA"": [4.36566, 94356.992, 21613454.09399724]}, ""0-0"": {""SDMA"": [0.15071, 47178.496, 313041576.53772146]}}]}"
2,4.519260000000001,7.000000000000001e-05,"{""1-0"": {""SDMA"": [4.368450000000001, 94356.992, 21599650.21918529]}, ""0-0"": {""SDMA"": [0.15081, 47178.496, 312834003.0501956]}}","{""stream_24_0_AllReduce-op3143"": [""2"", 4.519260000000001, 7.000000000000001e-05, {""1-0"": {""SDMA"": [4.368450000000001, 94356.992, 21599650.21918529]}, ""0-0"": {""SDMA"": [0.15081, 47178.496, 312834003.0501956]}}]}"
-,4.517815000000001,5.500000000000001e-05,"{""1-0"": {""SDMA"": [4.367055000000001, 94356.992, 21606552.156591266]}, ""0-0"": {""SDMA"": [0.15076, 47178.496, 312937789.79395854]}}","{""stream_24_0_AllReduce-op3143"": [""-"", 4.517815000000001, 5.500000000000001e-05, {""1-0"": {""SDMA"": [4.367055000000001, 94356.992, 21606552.156591266]}, ""0-0"": {""SDMA"": [0.15076, 47178.496, 312937789.79395854]}}]}"
1 step_num communication_cost wait_cost link_info communication_operator_cost
2 1 4.51637 4e-05 {"1-0": {"SDMA": [4.36566, 94356.992, 21613454.09399724]}, "0-0": {"SDMA": [0.15071, 47178.496, 313041576.53772146]}} {"stream_24_0_AllReduce-op3143": ["1", 4.51637, 4e-05, {"1-0": {"SDMA": [4.36566, 94356.992, 21613454.09399724]}, "0-0": {"SDMA": [0.15071, 47178.496, 313041576.53772146]}}]}
3 2 4.519260000000001 7.000000000000001e-05 {"1-0": {"SDMA": [4.368450000000001, 94356.992, 21599650.21918529]}, "0-0": {"SDMA": [0.15081, 47178.496, 312834003.0501956]}} {"stream_24_0_AllReduce-op3143": ["2", 4.519260000000001, 7.000000000000001e-05, {"1-0": {"SDMA": [4.368450000000001, 94356.992, 21599650.21918529]}, "0-0": {"SDMA": [0.15081, 47178.496, 312834003.0501956]}}]}
4 - 4.517815000000001 5.500000000000001e-05 {"1-0": {"SDMA": [4.367055000000001, 94356.992, 21606552.156591266]}, "0-0": {"SDMA": [0.15076, 47178.496, 312937789.79395854]}} {"stream_24_0_AllReduce-op3143": ["-", 4.517815000000001, 5.500000000000001e-05, {"1-0": {"SDMA": [4.367055000000001, 94356.992, 21606552.156591266]}, "0-0": {"SDMA": [0.15076, 47178.496, 312937789.79395854]}}]}

View File

@ -0,0 +1,4 @@
step_num,start_point,end_point,total,fp_point,bp_point,iteration_interval,fp_and_bp,tail,stream_24_0_AllReduce-op3143,stream_24_0_AllReduce-op3143_start_point,stream_24_0_AllReduce-op3143_end_point
1,61688106196804,61688109551051,3354247,61688106196804,61688109001447,0,2804643,549604,454080,61688109006157,61688109460237
2,61688109551051,61688112916194,3365143,61688109556119,61688112367356,5068,2811237,548838,454080,61688112371861,61688112825941
-,61688109551051,61688112916194,3365143,61688109556119,61688112367356,5068,2811237,548838,454080,61688112371861,61688112825941
1 step_num start_point end_point total fp_point bp_point iteration_interval fp_and_bp tail stream_24_0_AllReduce-op3143 stream_24_0_AllReduce-op3143_start_point stream_24_0_AllReduce-op3143_end_point
2 1 61688106196804 61688109551051 3354247 61688106196804 61688109001447 0 2804643 549604 454080 61688109006157 61688109460237
3 2 61688109551051 61688112916194 3365143 61688109556119 61688112367356 5068 2811237 548838 454080 61688112371861 61688112825941
4 - 61688109551051 61688112916194 3365143 61688109556119 61688112367356 5068 2811237 548838 454080 61688112371861 61688112825941

View File

@ -0,0 +1,70 @@
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""Test the hccl parser module."""
import csv
import os
import shutil
import tempfile
from mindspore.profiler.parser.hccl_parser import HcclParser
from tests.ut.python.profiler import PROFILER_DIR
def get_hccl_result(file_path):
    """
    Get hccl result from the hccl file.

    Args:
        file_path (str): The hccl file path.

    Returns:
        list[list], the parsed hccl information, one list of strings per csv row.
    """
    # The csv module expects files opened with newline='' so that line breaks
    # embedded in quoted fields reach the reader unchanged instead of being
    # rewritten by universal-newline translation.
    with open(file_path, 'r', newline='') as csv_file:
        return list(csv.reader(csv_file))
class TestHcclParser:
    """Test the class of `HcclParser`."""

    def setup_method(self):
        """Create a scratch output directory, seed it with the step trace file and build the parser."""
        step_trace_file_name = 'step_trace_raw_6_detail_time.csv'
        self._output_path = tempfile.mkdtemp(prefix='test_hccl_parser_')
        # The parser is given the scratch directory, so the step trace fixture is copied there first.
        fixture_step_trace = os.path.join(PROFILER_DIR, step_trace_file_name)
        scratch_step_trace = os.path.join(self._output_path, step_trace_file_name)
        shutil.copyfile(fixture_step_trace, scratch_step_trace)
        hccl_info_dir = os.path.join(PROFILER_DIR, 'hccl_info')
        self._parser = HcclParser(hccl_info_dir, '6', self._output_path)

    def teardown_method(self) -> None:
        """Remove the scratch output directory created for the test case."""
        shutil.rmtree(self._output_path)

    def test_parse(self):
        """Check that `parse` writes a hccl file equal to the expected fixture."""
        hccl_file_name = 'hccl_raw_6.csv'
        # Load the expectation before parsing, as the original test does.
        expected_rows = get_hccl_result(os.path.join(PROFILER_DIR, hccl_file_name))

        self._parser.parse()

        parsed_rows = get_hccl_result(os.path.join(self._output_path, hccl_file_name))
        assert expected_rows == parsed_rows