forked from mindspore-Ecosystem/mindspore
!2808 add_hccl_tools
Merge pull request !2808 from Guomenghao319/add_hccl_tools
This commit is contained in:
commit
48e9d60a1f
|
@ -0,0 +1,14 @@
|
|||
# description
|
||||
|
||||
mindspore distributed training launch helper utilty that will generate hccl config file.
|
||||
|
||||
# use
|
||||
|
||||
```
|
||||
python hccl_tools.py --device_num [1,8]
|
||||
```
|
||||
|
||||
output:
|
||||
```
|
||||
hccl_[device_num]p_[which device]_[server_ip].json
|
||||
```
|
|
@ -0,0 +1,165 @@
|
|||
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""generate hccl config file script"""
|
||||
import os
|
||||
import sys
|
||||
import json
|
||||
import socket
|
||||
import platform
|
||||
from argparse import ArgumentParser
|
||||
from typing import Dict, Any
|
||||
|
||||
|
||||
def parse_args():
|
||||
"""
|
||||
parse args .
|
||||
|
||||
Args:
|
||||
|
||||
Returns:
|
||||
args.
|
||||
|
||||
Examples:
|
||||
>>> parse_args()
|
||||
"""
|
||||
parser = ArgumentParser(description="mindspore distributed training launch "
|
||||
"helper utilty that will generate hccl"
|
||||
" config file")
|
||||
parser.add_argument("--device_num", type=str, default="[0,8]",
|
||||
help="The number of the D chip used. please note that the D chips"
|
||||
"used must be continuous, such [0,4] means to use four chips "
|
||||
"0,1,2,3; [0,1] means to use chip 0; The first four chips are"
|
||||
"a group, and the last four chips are a group. In addition to"
|
||||
"the [0,8] chips are allowed, other cross-group such as [3,6]"
|
||||
"are prohibited.")
|
||||
parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
|
||||
help="will use the visible devices sequentially")
|
||||
parser.add_argument("--server_ip", type=str, default="",
|
||||
help="server ip")
|
||||
args = parser.parse_args()
|
||||
return args
|
||||
|
||||
|
||||
def get_host_ip():
|
||||
"""
|
||||
get host ip
|
||||
"""
|
||||
ip = None
|
||||
|
||||
try:
|
||||
hostname = socket.gethostname()
|
||||
ip = socket.gethostbyname(hostname)
|
||||
except EOFError:
|
||||
pass
|
||||
|
||||
return ip
|
||||
|
||||
|
||||
def main():
|
||||
print("start", __file__)
|
||||
args = parse_args()
|
||||
|
||||
# visible_devices
|
||||
visible_devices = args.visible_devices.split(',')
|
||||
print('visible_devices:{}'.format(visible_devices))
|
||||
|
||||
# server_id
|
||||
ip = get_host_ip()
|
||||
if args.server_ip:
|
||||
server_id = args.server_ip
|
||||
elif ip:
|
||||
server_id = ip
|
||||
else:
|
||||
raise ValueError("please input server ip!")
|
||||
print('server_id:{}'.format(server_id))
|
||||
|
||||
# device_num
|
||||
first_num = int(args.device_num[1])
|
||||
last_num = int(args.device_num[3])
|
||||
if first_num < 0 or last_num > 8:
|
||||
raise ValueError("device num {} must be in range [0,8] !".format(args.device_num))
|
||||
if first_num > last_num:
|
||||
raise ValueError("First num {} of device num {} must less than last num {} !".format(first_num, args.device_num,
|
||||
last_num))
|
||||
if first_num < 4:
|
||||
if last_num > 4:
|
||||
if first_num == 0 and last_num == 8:
|
||||
pass
|
||||
else:
|
||||
raise ValueError("device num {} must be in the same group of [0,4] or [4,8] !".format(args.device_num))
|
||||
|
||||
device_num_list = list(range(first_num, last_num))
|
||||
print("device_num_list:", device_num_list)
|
||||
|
||||
assert len(visible_devices) >= len(device_num_list)
|
||||
|
||||
# construct hccn_table
|
||||
device_ips: Dict[Any, Any] = {}
|
||||
with open('/etc/hccn.conf', 'r') as fin:
|
||||
for hccn_item in fin.readlines():
|
||||
if hccn_item.strip().startswith('address_'):
|
||||
device_id, device_ip = hccn_item.split('=')
|
||||
device_id = device_id.split('_')[1]
|
||||
device_ips[device_id] = device_ip.strip()
|
||||
|
||||
arch = platform.processor()
|
||||
hccn_table = {'board_id': {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch],
|
||||
'chip_info': '910',
|
||||
'deploy_mode': 'lab',
|
||||
'group_count': '1',
|
||||
'group_list': []}
|
||||
instance_list = []
|
||||
rank_id = 0
|
||||
for instance_id in device_num_list:
|
||||
instance = {'devices': []}
|
||||
device_id = visible_devices[instance_id]
|
||||
device_ip = device_ips[device_id]
|
||||
instance['devices'].append({
|
||||
'device_id': device_id,
|
||||
'device_ip': device_ip,
|
||||
})
|
||||
print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
|
||||
instance['rank_id'] = str(rank_id)
|
||||
rank_id += 1
|
||||
instance['server_id'] = server_id
|
||||
instance_list.append(instance)
|
||||
hccn_table['group_list'].append({
|
||||
'device_num': str(len(device_num_list)),
|
||||
'server_num': '1',
|
||||
'group_name': '',
|
||||
'instance_count': str(len(device_num_list)),
|
||||
'instance_list': instance_list,
|
||||
})
|
||||
hccn_table['para_plane_nic_location'] = 'device'
|
||||
hccn_table['para_plane_nic_name'] = []
|
||||
for instance_id in device_num_list:
|
||||
eth_id = visible_devices[instance_id]
|
||||
hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
|
||||
hccn_table['para_plane_nic_num'] = str(len(device_num_list))
|
||||
hccn_table['status'] = 'completed'
|
||||
|
||||
# save hccn_table to file
|
||||
table_path = os.getcwd()
|
||||
table_fn = os.path.join(table_path,
|
||||
'hccl_{}p_{}_{}.json'.format(len(device_num_list), "".join(map(str, device_num_list)),
|
||||
server_id))
|
||||
with open(table_fn, 'w') as table_fp:
|
||||
json.dump(hccn_table, table_fp, indent=4)
|
||||
sys.stdout.flush()
|
||||
print("Completed: hccl file was save in :", table_fn)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
main()
|
Loading…
Reference in New Issue