forked from mindspore-Ecosystem/mindspore

commit 74fcbd2900 (parent fdc3a235e6)

    add hccl_config
@@ -60,14 +60,14 @@ Dataset used: [imagenet](http://www.image-net.org/)
 
 ### Usage
 
-- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
 - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
 
 ### Launch
 
 ```
 # training example
-Ascend: sh run_train.sh Ascend 8 192.168.0.1 0,1,2,3,4,5,6,7 ~/imagenet/train/ mobilenet_199.ckpt
+Ascend: sh run_train.sh Ascend 8 0,1,2,3,4,5,6,7 hccl_config.json ~/imagenet/train/ mobilenet_199.ckpt
 GPU: sh run_train.sh GPU 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
 ```
 
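The new [MINDSPORE_HCCL_CONFIG_PATH] argument replaces [SERVER_IP]: instead of the launcher generating an HCCL rank table on the fly, the caller now supplies one (hccl_config.json in the example) up front. A minimal sketch of what such a file contains follows; the field layout mirrors the generator this commit deletes from src/launch.py, while all IP addresses and the server id are made-up placeholders, not values from the source.

```python
# Sketch of a 2-device hccl_config.json, shaped like the table the deleted
# launcher code used to emit. IPs and server id below are placeholders.
import json

hccl_config = {
    'board_id': '0x0000',
    'chip_info': '910',
    'deploy_mode': 'lab',
    'group_count': '1',
    'group_list': [{
        'device_num': '2',
        'server_num': '1',
        'group_name': '',
        'instance_count': '2',
        'instance_list': [
            {'devices': [{'device_id': '0', 'device_ip': '192.98.92.100'}],
             'rank_id': '0', 'server_id': '10.0.0.1'},
            {'devices': [{'device_id': '1', 'device_ip': '192.98.92.101'}],
             'rank_id': '1', 'server_id': '10.0.0.1'},
        ],
    }],
    'para_plane_nic_location': 'device',
    'para_plane_nic_name': ['eth0', 'eth1'],
    'para_plane_nic_num': '2',
    'status': 'completed',
}

with open('hccl_config.json', 'w') as table_fp:
    json.dump(hccl_config, table_fp, indent=4)
```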
@@ -22,14 +22,16 @@ run_ascend()
         exit 1
     fi
 
-    if [ ! -d $5 ]
+    if [ ! -d $5 ] && [ ! -f $5 ]
     then
-        echo "error: DATASET_PATH=$5 is not a directory"
+        echo "error: DATASET_PATH=$5 is not a directory or file"
         exit 1
     fi
 
     BASEPATH=$(cd "`dirname $0`" || exit; pwd)
     export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+    export MINDSPORE_HCCL_CONFIG_PATH=$4
+    export RANK_TABLE_FILE=$4
     if [ -d "../train" ];
     then
         rm -rf ../train
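This hunk is from run_train.sh (named in its own usage text). Both exports point at the same rank-table file from $4, presumably so the path is found under either the older MINDSPORE_HCCL_CONFIG_PATH name or the newer RANK_TABLE_FILE name. A minimal sketch of how a launched process could pick the path up from either variable; this helper is illustrative, not part of the commit:

```python
# Illustrative lookup; run_train.sh exports both names with the same value,
# so either read succeeds.
import os

def rank_table_path():
    return os.environ.get('RANK_TABLE_FILE',
                          os.environ.get('MINDSPORE_HCCL_CONFIG_PATH'))
```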
@@ -38,8 +40,7 @@ run_ascend()
     cd ../train || exit
     python ${BASEPATH}/../src/launch.py \
         --nproc_per_node=$2 \
-        --visible_devices=$4 \
-        --server_id=$3 \
+        --visible_devices=$3 \
         --training_script=${BASEPATH}/../train.py \
         --dataset_path=$5 \
         --pre_trained=$6 \
@@ -80,7 +81,7 @@ run_gpu()
     if [ $# -gt 6 ] || [ $# -lt 4 ]
     then
         echo "Usage:\n \
-              Ascend: sh run_train.sh Ascend [DEVICE_NUM] [SERVER_IP(x.x.x.x)] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [CKPT_PATH]\n \
+              Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
               GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
               "
         exit 1
@@ -15,7 +15,6 @@
 """launch train script"""
 import os
 import sys
-import json
 import subprocess
 import shutil
 from argparse import ArgumentParser
@@ -42,8 +41,6 @@ def parse_args():
                              "each process can be bound to a single D.")
     parser.add_argument("--visible_devices", type=str, default="0,1,2,3,4,5,6,7",
                         help="will use the visible devices sequentially")
-    parser.add_argument("--server_id", type=str, default="",
-                        help="server ip")
     parser.add_argument("--training_script", type=str,
                         help="The full path to the single D training "
                              "program/script to be launched in parallel, "
@@ -63,66 +60,6 @@ def main():
     assert os.path.isfile(args.training_script)
     assert len(visible_devices) >= args.nproc_per_node
     print('visible_devices:{}'.format(visible_devices))
-    if not args.server_id:
-        print('pleaser input server ip!!!')
-        exit(0)
-    print('server_id:{}'.format(args.server_id))
-
-    # construct hccn_table
-    hccn_configs = open('/etc/hccn.conf', 'r').readlines()
-    device_ips = {}
-    for hccn_item in hccn_configs:
-        hccn_item = hccn_item.strip()
-        if hccn_item.startswith('address_'):
-            device_id, device_ip = hccn_item.split('=')
-            device_id = device_id.split('_')[1]
-            device_ips[device_id] = device_ip
-            print('device_id:{}, device_ip:{}'.format(device_id, device_ip))
-    hccn_table = {}
-    hccn_table['board_id'] = '0x0000'
-    hccn_table['chip_info'] = '910'
-    hccn_table['deploy_mode'] = 'lab'
-    hccn_table['group_count'] = '1'
-    hccn_table['group_list'] = []
-    instance_list = []
-    usable_dev = ''
-    for instance_id in range(args.nproc_per_node):
-        instance = {}
-        instance['devices'] = []
-        device_id = visible_devices[instance_id]
-        device_ip = device_ips[device_id]
-        usable_dev += str(device_id)
-        instance['devices'].append({
-            'device_id': device_id,
-            'device_ip': device_ip,
-        })
-        instance['rank_id'] = str(instance_id)
-        instance['server_id'] = args.server_id
-        instance_list.append(instance)
-    hccn_table['group_list'].append({
-        'device_num': str(args.nproc_per_node),
-        'server_num': '1',
-        'group_name': '',
-        'instance_count': str(args.nproc_per_node),
-        'instance_list': instance_list,
-    })
-    hccn_table['para_plane_nic_location'] = 'device'
-    hccn_table['para_plane_nic_name'] = []
-    for instance_id in range(args.nproc_per_node):
-        eth_id = visible_devices[instance_id]
-        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
-    hccn_table['para_plane_nic_num'] = str(args.nproc_per_node)
-    hccn_table['status'] = 'completed'
-
-    # save hccn_table to file
-    table_path = os.getcwd()
-    if not os.path.exists(table_path):
-        os.mkdir(table_path)
-    table_fn = os.path.join(table_path,
-                            'rank_table_{}p_{}_{}.json'.format(args.nproc_per_node, usable_dev, args.server_id))
-    with open(table_fn, 'w') as table_fp:
-        json.dump(hccn_table, table_fp, indent=4)
-    sys.stdout.flush()
 
     # spawn the processes
     processes = []
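The block removed above is what used to build the rank table at launch time, by reading each device's NIC IP from /etc/hccn.conf and dumping the table as JSON. For anyone who still wants to generate the file automatically rather than write it by hand, here is a standalone sketch reconstructed from the deleted src/launch.py code; the function signature, argument handling, and output filename are choices of this sketch, not part of the commit.

```python
# Standalone rank-table generator reconstructed from the deleted launcher
# code. /etc/hccn.conf parsing and the table layout follow the removed
# lines; the interface and output name are assumptions of this sketch.
import json

def generate_rank_table(nproc_per_node, visible_devices, server_id,
                        output='hccl_config.json'):
    # /etc/hccn.conf maps each Ascend device to its NIC IP, one line per
    # device, e.g. "address_0=192.98.92.100".
    device_ips = {}
    with open('/etc/hccn.conf', 'r') as hccn:
        for item in hccn:
            item = item.strip()
            if item.startswith('address_'):
                device_id, device_ip = item.split('=')
                device_ips[device_id.split('_')[1]] = device_ip

    # one instance per rank, bound to one device each
    instance_list = []
    for rank_id in range(nproc_per_node):
        device_id = visible_devices[rank_id]
        instance_list.append({
            'devices': [{'device_id': device_id,
                         'device_ip': device_ips[device_id]}],
            'rank_id': str(rank_id),
            'server_id': server_id,
        })

    hccn_table = {
        'board_id': '0x0000',
        'chip_info': '910',
        'deploy_mode': 'lab',
        'group_count': '1',
        'group_list': [{
            'device_num': str(nproc_per_node),
            'server_num': '1',
            'group_name': '',
            'instance_count': str(nproc_per_node),
            'instance_list': instance_list,
        }],
        'para_plane_nic_location': 'device',
        'para_plane_nic_name': ['eth{}'.format(visible_devices[i])
                                for i in range(nproc_per_node)],
        'para_plane_nic_num': str(nproc_per_node),
        'status': 'completed',
    }
    with open(output, 'w') as table_fp:
        json.dump(hccn_table, table_fp, indent=4)

if __name__ == '__main__':
    # e.g. 8 ranks on devices 0-7 of one server; the ip is a placeholder
    generate_rank_table(8, [str(i) for i in range(8)], '10.0.0.1')
```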
@@ -137,9 +74,6 @@ def main():
         device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
-        if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
-            env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
         os.mkdir(device_dir)
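With table_fn gone, the spawn loop no longer injects the rank-table path into each child's environment; every rank simply inherits MINDSPORE_HCCL_CONFIG_PATH / RANK_TABLE_FILE from the values run_train.sh exported. A minimal sketch of the per-rank spawn as it reads after this hunk; the command construction and surrounding names are assumptions based on the visible context, not lines from the diff.

```python
# Illustrative per-rank spawn after this change. cur_path, args, and the
# command line are assumed from context; env inherits RANK_TABLE_FILE /
# MINDSPORE_HCCL_CONFIG_PATH from the shell that exported them.
import os
import shutil
import subprocess
import sys

def spawn(args, visible_devices, cur_path):
    processes = []
    for rank_id in range(args.nproc_per_node):
        device_id = visible_devices[rank_id]
        device_dir = os.path.join(cur_path, 'device{}'.format(rank_id))
        env = os.environ.copy()  # rank-table vars come in via run_train.sh
        env['RANK_ID'] = str(rank_id)
        env['DEVICE_ID'] = str(device_id)
        if os.path.exists(device_dir):
            shutil.rmtree(device_dir)
        os.mkdir(device_dir)
        cmd = [sys.executable, args.training_script,
               '--dataset_path={}'.format(args.dataset_path)]
        processes.append(subprocess.Popen(cmd, env=env, cwd=device_dir))
    return processes
```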