add hccl_config

This commit is contained in:
wandongdong 2020-07-08 21:29:09 +08:00
parent fdc3a235e6
commit 74fcbd2900
3 changed files with 8 additions and 73 deletions

View File

@ -60,14 +60,14 @@ Dataset used: [imagenet](http://www.image-net.org/)
### Usage ### Usage
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH] - Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] - GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
### Launch ### Launch
``` ```
# training example # training example
Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ mobilenet_199.ckpt
GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
``` ```

View File

@ -22,14 +22,16 @@ run_ascend()
exit 1 exit 1
fi fi
if [ ! -d $5 ] if [ ! -d $5 ] && [ ! -f $5 ]
then then
echo "error: DATASET_PATH=$5 is not a directory" echo "error: DATASET_PATH=$5 is not a directory or file"
exit 1 exit 1
fi fi
BASEPATH=$(cd "`dirname $0`" || exit; pwd) BASEPATH=$(cd "`dirname $0`" || exit; pwd)
export PYTHONPATH=${BASEPATH}:$PYTHONPATH export PYTHONPATH=${BASEPATH}:$PYTHONPATH
export MINDSPORE_HCCL_CONFIG_PATH=$4
export RANK_TABLE_FILE=$4
if [ -d "../train" ]; if [ -d "../train" ];
then then
rm -rf ../train rm -rf ../train
@ -38,8 +40,7 @@ run_ascend()
cd ../train || exit cd ../train || exit
python ${BASEPATH}/../src/launch.py \ python ${BASEPATH}/../src/launch.py \
--nproc_per_node=$2 \ --nproc_per_node=$2 \
--visible_devices=$4 \ --visible_devices=$3 \
--server_id=$3 \
--training_script=${BASEPATH}/../train.py \ --training_script=${BASEPATH}/../train.py \
--dataset_path=$5 \ --dataset_path=$5 \
--pre_trained=$6 \ --pre_trained=$6 \
@ -80,7 +81,7 @@ run_gpu()
if [ $# -gt 6 ] || [ $# -lt 4 ] if [ $# -gt 6 ] || [ $# -lt 4 ]
then then
echo "Usage:\n \ echo "Usage:\n \
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \ Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \ GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
" "
exit 1 exit 1

View File

@ -15,7 +15,6 @@
"""launch train script""" """launch train script"""
import os import os
import sys import sys
import json
import subprocess import subprocess
import shutil import shutil
from argparse import ArgumentParser from argparse import ArgumentParser
@ -42,8 +41,6 @@ def parse_args():
"each process can be bound to a single D.") "each process can be bound to a single D.")
parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7", parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
help="will use the visible devices sequentially") help="will use the visible devices sequentially")
parser.add_argument("--server_id", type=str, default="",
help="server ip")
parser.add_argument("--training_script", type=str, parser.add_argument("--training_script", type=str,
help="The full path to the single D training " help="The full path to the single D training "
"program/script to be launched in parallel, " "program/script to be launched in parallel, "
@ -63,66 +60,6 @@ def main():
assert os.path.isfile(args.training_script) assert os.path.isfile(args.training_script)
assert len(visible_devices) >= args.nproc_per_node assert len(visible_devices) >= args.nproc_per_node
print('visible_devices:{}'.format(visible_devices)) print('visible_devices:{}'.format(visible_devices))
if not args.server_id:
print('pleaser input server ip!!!')
exit(0)
print('server_id:{}'.format(args.server_id))
# construct hccn_table
hccn_configs = open('/etc/hccn.conf', 'r').readlines()
device_ips = {}
for hccn_item in hccn_configs:
hccn_item = hccn_item.strip()
if hccn_item.startswith('address_'):
device_id, device_ip = hccn_item.split('=')
device_id = device_id.split('_')[1]
device_ips[device_id] = device_ip
print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
hccn_table = {}
hccn_table['board_id'] = '0x0000'
hccn_table['chip_info'] = '910'
hccn_table['deploy_mode'] = 'lab'
hccn_table['group_count'] = '1'
hccn_table['group_list'] = []
instance_list = []
usable_dev = ''
for instance_id in range(args.nproc_per_node):
instance = {}
instance['devices'] = []
device_id = visible_devices[instance_id]
device_ip = device_ips[device_id]
usable_dev += str(device_id)
instance['devices'].append({
'device_id': device_id,
'device_ip': device_ip,
})
instance['rank_id'] = str(instance_id)
instance['server_id'] = args.server_id
instance_list.append(instance)
hccn_table['group_list'].append({
'device_num': str(args.nproc_per_node),
'server_num': '1',
'group_name': '',
'instance_count': str(args.nproc_per_node),
'instance_list': instance_list,
})
hccn_table['para_plane_nic_location'] = 'device'
hccn_table['para_plane_nic_name'] = []
for instance_id in range(args.nproc_per_node):
eth_id = visible_devices[instance_id]
hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
hccn_table['para_plane_nic_num'] = str(args.nproc_per_node)
hccn_table['status'] = 'completed'
# save hccn_table to file
table_path = os.getcwd()
if not os.path.exists(table_path):
os.mkdir(table_path)
table_fn = os.path.join(table_path,
'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
with open(table_fn, 'w') as table_fp:
json.dump(hccn_table, table_fp, indent=4)
sys.stdout.flush()
# spawn the processes # spawn the processes
processes = [] processes = []
@ -137,9 +74,6 @@ def main():
device_dir = os.path.join(cur_path, 'device{}'.format(rank_id)) device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
env['RANK_ID'] = str(rank_id) env['RANK_ID'] = str(rank_id)
env['DEVICE_ID'] = str(device_id) env['DEVICE_ID'] = str(device_id)
if args.nproc_per_node > 1:
env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
env['RANK_TABLE_FILE'] = table_fn
if os.path.exists(device_dir): if os.path.exists(device_dir):
shutil.rmtree(device_dir) shutil.rmtree(device_dir)
os.mkdir(device_dir) os.mkdir(device_dir)