add hccl_tools

Signed-off-by: GuoMengHao <guomenghao@huawei.com>
This commit is contained in:
GuoMengHao 2020-07-02 11:32:34 +08:00
parent bc42685436
commit ab90f30a2b
2 changed files with 179 additions and 0 deletions

View File

@ -0,0 +1,14 @@
# description
MindSpore distributed training launch helper utility that generates the hccl config file.
# use
```
python hccl_tools.py --device_num "[0,8]"
```
output:
```
hccl_[device_num]p_[which device]_[server_ip].json
```

View File

@ -0,0 +1,165 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""generate hccl config file script"""
import os
import sys
import json
import socket
import platform
from argparse import ArgumentParser
from typing import Dict, Any
def parse_args(argv=None):
    """
    Parse the command-line options of the hccl config file generator.

    Args:
        argv (list[str], optional): Argument strings to parse. Default: None,
            in which case argparse reads them from ``sys.argv[1:]``.

    Returns:
        argparse.Namespace, holding the string attributes ``device_num``,
        ``visible_devices`` and ``server_ip``.

    Examples:
        >>> parse_args(["--device_num", "[0,4]"]).device_num
        '[0,4]'
    """
    parser = ArgumentParser(description="mindspore distributed training launch "
                                        "helper utility that will generate hccl"
                                        " config file")
    # Adjacent string literals are concatenated verbatim, so every fragment
    # except the last one must end with an explicit space.
    parser.add_argument("--device_num", type=str, default="[0,8]",
                        help="The number of the D chip used. please note that the D chips "
                             "used must be continuous, such [0,4] means to use four chips "
                             "0123; [0,1] means to use chip 0; The first four chips are "
                             "a group, and the last four chips are a group. In addition to "
                             "the [0,8] chips are allowed, other cross-group such as [3,6] "
                             "are prohibited.")
    parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                        help="will use the visible devices sequentially")
    parser.add_argument("--server_ip", type=str, default="",
                        help="server ip")
    return parser.parse_args(argv)
def get_host_ip():
    """
    Resolve the IP address of the local host from its hostname.

    Returns:
        str, the address returned by ``socket.gethostbyname`` for the local
        hostname, or None when the hostname cannot be resolved.
    """
    try:
        return socket.gethostbyname(socket.gethostname())
    except OSError:
        # Resolution failures surface as socket.gaierror / socket.herror, which
        # are OSError subclasses. Report "unknown" instead of crashing so the
        # caller can ask the user for an explicit server ip.
        return None
def _parse_device_range(device_num):
    """Parse a ``"[first,last]"`` string into a validated ``(first, last)`` pair of chip indices."""
    bounds = device_num.strip().strip('[]()').split(',')
    try:
        # int() tolerates surrounding blanks, so "[0, 8]" is accepted as well.
        first_num, last_num = (int(bound) for bound in bounds)
    except ValueError as err:
        raise ValueError("device num {} must look like [first,last], e.g. [0,8] !".format(device_num)) from err
    if first_num < 0 or last_num > 8:
        raise ValueError("device num {} must be in range [0,8] !".format(device_num))
    # An empty range would produce a config file without any device.
    if first_num >= last_num:
        raise ValueError("First num {} of device num {} must less than last num {} !".format(first_num, device_num,
                                                                                             last_num))
    # Chips 0-3 and 4-7 form two groups; only the full range may span both.
    if first_num < 4 < last_num and (first_num, last_num) != (0, 8):
        raise ValueError("device num {} must be in the same group of [0,4] or [4,8] !".format(device_num))
    return first_num, last_num


def _read_device_ips(conf_path='/etc/hccn.conf'):
    """Return a dict mapping device id to device ip, read from the ``address_<id>=<ip>`` lines of conf_path."""
    device_ips: Dict[Any, Any] = {}
    with open(conf_path, 'r') as fin:
        for hccn_item in fin:
            key, separator, value = hccn_item.partition('=')
            key = key.strip()
            if not separator or not key.startswith('address_'):
                continue
            device_ips[key.split('_')[1]] = value.strip()
    return device_ips


def _get_board_id():
    """Return the board id string that matches the architecture of the current machine."""
    board_ids = {'aarch64': '0x002f', 'x86_64': '0x0000'}
    # platform.processor() may be empty or a placeholder on some systems, so
    # platform.machine() is consulted as a fallback.
    candidates = (platform.processor(), platform.machine())
    for arch in candidates:
        if arch in board_ids:
            return board_ids[arch]
    raise ValueError("unsupported architecture {}, expected one of {} !".format(candidates, sorted(board_ids)))


def main():
    """
    Generate the hccl config file for the requested chips and save it as json.

    The chip range comes from ``--device_num``, the device ids from
    ``--visible_devices`` and the device ips from ``/etc/hccn.conf``. The
    result is written to ``hccl_<count>p_<chip indices>_<server id>.json`` in
    the current working directory.

    Raises:
        ValueError: If no server ip is available, if ``--device_num`` is
            malformed, out of range, empty or crosses the chip groups, if
            ``--visible_devices`` has too few entries, if a selected device
            has no address in ``/etc/hccn.conf``, or if the machine
            architecture is not supported.
    """
    print("start", __file__)
    args = parse_args()

    # visible_devices
    visible_devices = [device.strip() for device in args.visible_devices.split(',')]
    print('visible_devices:{}'.format(visible_devices))

    # server_id: only fall back to hostname resolution when no ip was given.
    server_id = args.server_ip or get_host_ip()
    if not server_id:
        raise ValueError("please input server ip!")
    print('server_id:{}'.format(server_id))

    # device_num
    first_num, last_num = _parse_device_range(args.device_num)
    device_num_list = list(range(first_num, last_num))
    print("device_num_list:", device_num_list)
    # visible_devices is indexed by chip index, so it must reach up to last_num.
    if last_num > len(visible_devices):
        raise ValueError("visible devices {} must have at least {} entries to use device num {} !".format(
            args.visible_devices, last_num, args.device_num))
    selected_devices = [visible_devices[instance_id] for instance_id in device_num_list]

    # construct hccn_table
    device_ips = _read_device_ips()
    missing_devices = [device_id for device_id in selected_devices if device_id not in device_ips]
    if missing_devices:
        raise ValueError("no address found in /etc/hccn.conf for devices {} !".format(missing_devices))

    instance_list = []
    for rank_id, device_id in enumerate(selected_devices):
        device_ip = device_ips[device_id]
        print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
        instance_list.append({
            'devices': [{
                'device_id': device_id,
                'device_ip': device_ip,
            }],
            'rank_id': str(rank_id),
            'server_id': server_id,
        })

    device_count = str(len(device_num_list))
    hccn_table = {
        'board_id': _get_board_id(),
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': device_count,
            'server_num': '1',
            'group_name': '',
            'instance_count': device_count,
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(eth_id) for eth_id in selected_devices],
        'para_plane_nic_num': device_count,
        'status': 'completed',
    }

    # save hccn_table to file
    table_fn = os.path.join(os.getcwd(),
                            'hccl_{}p_{}_{}.json'.format(len(device_num_list), "".join(map(str, device_num_list)),
                                                         server_id))
    with open(table_fn, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)
    print("Completed: hccl file was save in :", table_fn)
    sys.stdout.flush()
# Run the generator only when this file is executed as a script, not when it is imported.
if __name__ == "__main__":
    main()