add_python_distribute_pretrain_script

Signed-off-by: GuoMengHao <guomenghao@huawei.com>
GuoMengHao 2020-07-27 11:21:43 +08:00
parent c22792aab1
commit 2309e7369a
11 changed files with 425 additions and 80 deletions

View File

@@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 ``` bash
-sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
 ```
 ### Fine-Tuning and Evaluation

View File

@@ -0,0 +1,48 @@
# Run distribute pretrain
## Description
The number of Ascend (D) chips is allocated automatically based on the `device_num` set in the HCCL config file; you do not need to specify it.
## How to use
For example, to run distributed training of the BERT model on Ascend chips, run the following command in the `/bert/` directory:
```
python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
```
Output:
```
hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
the number of logical core: 192
avg_core_per_rank: 96
rank_size: 2
start training for rank 0, device 5:
rank_id: 0
device_id: 5
core nums: 0-95
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG5/log.txt
start training for rank 1, device 6:
rank_id: 1
device_id: 6
core nums: 96-191
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG6/log.txt
```
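The `core nums` ranges in the sample output come from splitting the host's logical CPU cores evenly across the ranks started on this server, which is what the launcher's core-binding logic does. Below is a minimal sketch of that calculation, assuming 192 logical cores and 2 local ranks as in the sample:

```python
# Assumed values matching the sample output above; the launcher gets the core
# count from multiprocessing.cpu_count() and the rank count from the devices
# listed for this server in the HCCL config file.
cores = 192
local_ranks = 2

avg_core_per_rank = cores // local_ranks          # 96
for count in range(local_ranks):
    start = count * avg_core_per_rank
    end = start + avg_core_per_rank - 1           # core_gap = avg_core_per_rank - 1
    print("rank {} -> taskset -c {}-{}".format(count, start, end))
# rank 0 -> taskset -c 0-95
# rank 1 -> taskset -c 96-191
```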
## Note
1. `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools). The launcher starts only the devices that belong to the local server in this file; a sketch of how that match works follows this list.
2. Hyper-parameters should be customized in `hyper_parameter_config.ini`. The following parameters are not allowed to be configured there, because the launcher sets them itself:
   - device_id
   - device_num
   - data_dir (given to the launcher via `--data_dir`)
3. For other models, customize the `--run_script_dir` option and the corresponding `hyper_parameter_config.ini`.
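As noted in item 1, the rank table may describe more than one server. The launcher decides which devices belong to the local host by reading the device IPs from `/etc/hccn.conf` and matching them against the `server_list` entries of the JSON file. A minimal sketch of that matching logic follows; the `hccn.conf` lines, server ID, and device IPs are illustrative values, not real addresses:

```python
import json

# Illustrative /etc/hccn.conf content and rank table; real values depend on
# your host and on the file generated by hccl_tools.py.
hccn_conf_lines = ["address_5=192.98.92.131", "address_6=192.98.93.131"]
rank_table = json.loads("""
{
  "server_list": [
    {"server_id": "10.155.111.140",
     "device": [
       {"device_id": "5", "device_ip": "192.98.92.131", "rank_id": "0"},
       {"device_id": "6", "device_ip": "192.98.93.131", "rank_id": "1"}],
     "host_nic_ip": "reserve"}],
  "status": "completed"
}
""")

# Collect local device IPs, keyed by device id (mirrors the launcher).
device_ips = {}
for line in hccn_conf_lines:
    if line.strip().startswith("address_"):
        device_id, device_ip = line.split("=")
        device_ips[device_id.split("_")[1]] = device_ip.strip()

# Total rank size is the number of devices across all servers; the local
# server is the one whose first device IP is among this host's device IPs.
rank_size = 0
this_server = None
for server in rank_table["server_list"]:
    rank_size += len(server["device"])
    if server["device"][0]["device_ip"] in device_ips.values():
        this_server = server

print(rank_size, [d["device_id"] for d in this_server["device"]])  # 2 ['5', '6']
```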

View File

@@ -0,0 +1,11 @@
[config]
distribute=true
epoch_size=40
enable_save_ckpt=true
enable_lossscale=true
do_shuffle=true
enable_data_sink=true
data_sink_steps=100
save_checkpoint_path=./checkpoint/
save_checkpoint_steps=10000
save_checkpoint_num=1
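Every key in this `[config]` section is forwarded to `run_pretrain.py` unchanged as a `--key=value` flag: the launcher reads the file with `configparser` and joins the pairs into one option string. A minimal sketch of that step, assuming the file sits at the path passed as `--hyper_parameter_config_dir`:

```python
import configparser

# Assumed path for illustration; the launcher receives it via
# --hyper_parameter_config_dir.
config_path = "./scripts/ascend_distributed_launcher/hyper_parameter_config.ini"

cf = configparser.ConfigParser()
cf.read(config_path)
cfg = dict(cf.items("config"))

# Turn every entry into a --key=value command-line flag.
opt = " ".join("--{}={}".format(key, value) for key, value in cfg.items())
print(opt)
# --distribute=true --epoch_size=40 --enable_save_ckpt=true ... --save_checkpoint_num=1

# device_id, device_num and data_dir must not appear here: the launcher
# appends them itself and raises a ValueError otherwise.
assert not any(k in cfg for k in ("device_id", "device_num", "data_dir"))
```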

View File

@@ -0,0 +1,142 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""distribute pretrain script"""
import os
import json
import configparser
import multiprocessing
from argparse import ArgumentParser
def parse_args():
    """
    parse args.

    Args:

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training")

    parser.add_argument("--run_script_dir", type=str, default="",
                        help="Run script path, it is better to use absolute path")
    parser.add_argument("--hyper_parameter_config_dir", type=str, default="",
                        help="Hyper Parameter config path, it is better to use absolute path")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--hccl_config_dir", type=str, default="",
                        help="Hccl config path, it is better to use absolute path")

    args = parser.parse_args()
    return args


def distribute_pretrain():
    """
    distribute pretrain scripts. The number of D chips can be automatically allocated
    based on the device_num set in the hccl config file, you do not need to specify that.
    """
    print("start", __file__)
    args = parse_args()

    run_script = args.run_script_dir
    data_dir = args.data_dir
    cf = configparser.ConfigParser()
    cf.read(args.hyper_parameter_config_dir)
    cfg = dict(cf.items("config"))

    # make the rank table visible to MindSpore
    print("hccl_config_dir:", args.hccl_config_dir)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir
    os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir

    cores = multiprocessing.cpu_count()
    print("the number of logical core:", cores)

    # get device_ips of this host from /etc/hccn.conf
    device_ips = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin.readlines():
            if hccn_item.strip().startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

    # total rank size across all servers; this_server is the entry whose
    # device IPs match the local host
    with open(args.hccl_config_dir, "r", encoding="utf-8") as fin:
        hccl_config = json.loads(fin.read())
        rank_size = 0
        for server in hccl_config["server_list"]:
            rank_size += len(server["device"])
            if server["device"][0]["device_ip"] in device_ips.values():
                this_server = server

    os.environ['RANK_SIZE'] = str(rank_size)
    print("total rank size:", rank_size)
    print("this server rank size:", len(this_server["device"]))

    # split the logical cores evenly among the ranks started on this server
    avg_core_per_rank = int(int(cores) / len(this_server["device"]))
    core_gap = avg_core_per_rank - 1
    print("avg_core_per_rank:", avg_core_per_rank)

    count = 0
    for instance in this_server["device"]:
        device_id = instance["device_id"]
        rank_id = instance["rank_id"]
        print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":")
        print("rank_id:", rank_id)
        print("device_id:", device_id)

        start = count * int(avg_core_per_rank)
        count += 1
        end = start + core_gap
        cmdopt = str(start) + "-" + str(end)

        os.environ["DEVICE_ID"] = device_id
        os.environ["RANK_ID"] = rank_id
        os.environ["DEPLOY_MODE"] = "0"
        os.environ["GE_USE_STATIC_MEMORY"] = "1"

        # prepare a clean per-device working directory and log directory
        os.system("rm -rf LOG" + str(device_id))
        os.system("mkdir ./LOG" + str(device_id))
        os.system("cp *.py ./LOG" + str(device_id))
        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
        os.system("env > ./LOG" + str(device_id) + "/env.log")
        cur_dir = os.getcwd()
        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
        os.environ["GLOG_logtostderr"] = "0"

        print("core_nums:", cmdopt)
        print("epoch_size:", str(cfg['epoch_size']))
        print("data_dir:", data_dir)
        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")

        # bind the process to its core range and forward every option from
        # hyper_parameter_config.ini as --key=value
        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
        opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
        if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
            raise ValueError("hyper_parameter_config.ini can not set 'device_id',"
                             " 'device_num' or 'data_dir'! ")
        cmd += opt
        cmd += " --data_dir=" + data_dir
        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
        os.system(cmd)


if __name__ == "__main__":
    distribute_pretrain()
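For reference, combining the sample `hyper_parameter_config.ini` above with rank 0 of the two-device example from the README (device 5, core range 0-95), the launcher ends up executing a per-rank command roughly along these lines. This is an illustrative reconstruction, not output of the script, and the data path is the placeholder from the usage example:

```python
# Illustrative reconstruction of the per-rank command string the launcher
# builds (cfg values from the sample hyper_parameter_config.ini, device 5,
# 2 ranks total).
expected_cmd = (
    "taskset -c 0-95 python ./run_pretrain.py "
    "--distribute=true --epoch_size=40 --enable_save_ckpt=true "
    "--enable_lossscale=true --do_shuffle=true --enable_data_sink=true "
    "--data_sink_steps=100 --save_checkpoint_path=./checkpoint/ "
    "--save_checkpoint_steps=10000 --save_checkpoint_num=1 "
    "--data_dir=/path/dataset/ "
    "--device_id=5 --device_num=2 >./LOG5/log.txt 2>&1 &"
)
print(expected_cmd)
```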

View File

@@ -16,57 +16,16 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
+echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
 echo "It is better to use absolute path."
+echo "For hyper parameter, please note that you should customize the scripts:
+'{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
 echo "=============================================================================================================="
-EPOCH_SIZE=$2
-DATA_DIR=$3
-SCHEMA_DIR=$4
-PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
-export RANK_TABLE_FILE=$5
-export RANK_SIZE=$1
-cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
-echo "the number of logical core" $cores
-avg_core_per_rank=`expr $cores \/ $RANK_SIZE`
-core_gap=`expr $avg_core_per_rank \- 1`
-echo "avg_core_per_rank" $avg_core_per_rank
-echo "core_gap" $core_gap
-for((i=0;i<RANK_SIZE;i++))
-do
-    start=`expr $i \* $avg_core_per_rank`
-    export DEVICE_ID=$i
-    export RANK_ID=$i
-    export DEPLOY_MODE=0
-    export GE_USE_STATIC_MEMORY=1
-    end=`expr $start \+ $core_gap`
-    cmdopt=$start"-"$end
-    rm -rf LOG$i
-    mkdir ./LOG$i
-    cp *.py ./LOG$i
-    cd ./LOG$i || exit
-    echo "start training for rank $i, device $DEVICE_ID"
-    mkdir -p ms_log
-    CUR_DIR=`pwd`
-    export GLOG_log_dir=${CUR_DIR}/ms_log
-    export GLOG_logtostderr=0
-    env > env.log
-    taskset -c $cmdopt python ${PROJECT_DIR}/../run_pretrain.py \
-    --distribute="true" \
-    --epoch_size=$EPOCH_SIZE \
-    --device_id=$DEVICE_ID \
-    --device_num=$RANK_SIZE \
-    --enable_save_ckpt="true" \
-    --enable_lossscale="true" \
-    --do_shuffle="true" \
-    --enable_data_sink="true" \
-    --data_sink_steps=100 \
-    --load_checkpoint_path="" \
-    --save_checkpoint_steps=10000 \
-    --save_checkpoint_num=1 \
-    --data_dir=$DATA_DIR \
-    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
-    cd ../
-done
+CUR_DIR=`pwd`
+python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \
+    --run_script_dir=${CUR_DIR}/run_pretrain.py \
+    --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
+    --data_dir=$1 \
+    --hccl_config_dir=$2

View File

@@ -0,0 +1,48 @@
# Run distribute pretrain
## Description
The number of Ascend (D) chips is allocated automatically based on the `device_num` set in the HCCL config file; you do not need to specify it.
## How to use
For example, to run distributed training of the BERT model on Ascend chips, run the following command in the `/bert/` directory (a Python-based alternative is sketched at the end of this README):
```
python model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
```
Output:
```
hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
the number of logical core: 192
avg_core_per_rank: 96
rank_size: 2
start training for rank 0, device 5:
rank_id: 0
device_id: 5
core nums: 0-95
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG5/log.txt
start training for rank 1, device 6:
rank_id: 1
device_id: 6
core nums: 96-191
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG6/log.txt
```
## Note
1. `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
2. Hyper-parameters should be customized in `hyper_parameter_config.ini`. The following parameters are not allowed to be configured there, because the launcher sets them itself:
   - device_id
   - device_num
   - data_dir (given to the launcher via `--data_dir`)
3. For other models, customize the `--run_script_dir` option and the corresponding `hyper_parameter_config.ini`.
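Finally, if it is more convenient to start the launcher from Python rather than from a shell (for example inside a job-submission script), the same invocation shown in the usage section above can be issued with `subprocess`. This is only a convenience sketch, not part of the provided scripts, and it reuses the same illustrative paths:

```python
import subprocess

# Same arguments as the command in the usage section; adjust the paths for
# your environment.
subprocess.run([
    "python", "model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py",
    "--run_script_dir", "./run_pretrain.py",
    "--hyper_parameter_config_dir", "model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini",
    "--data_dir", "/path/dataset/",
    "--hccl_config_dir", "model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json",
], check=True)
```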

View File

@@ -0,0 +1,11 @@
[config]
distribute=true
epoch_size=40
enable_save_ckpt=true
enable_lossscale=true
do_shuffle=true
enable_data_sink=true
data_sink_steps=100
save_checkpoint_path=./checkpoint/
save_checkpoint_steps=10000
save_checkpoint_num=1

View File

@@ -0,0 +1,142 @@
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""distribute pretrain script"""
import os
import json
import configparser
import multiprocessing
from argparse import ArgumentParser
def parse_args():
    """
    parse args.

    Args:

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training")

    parser.add_argument("--run_script_dir", type=str, default="",
                        help="Run script path, it is better to use absolute path")
    parser.add_argument("--hyper_parameter_config_dir", type=str, default="",
                        help="Hyper Parameter config path, it is better to use absolute path")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--hccl_config_dir", type=str, default="",
                        help="Hccl config path, it is better to use absolute path")

    args = parser.parse_args()
    return args


def distribute_pretrain():
    """
    distribute pretrain scripts. The number of D chips can be automatically allocated
    based on the device_num set in the hccl config file, you do not need to specify that.
    """
    print("start", __file__)
    args = parse_args()

    run_script = args.run_script_dir
    data_dir = args.data_dir
    cf = configparser.ConfigParser()
    cf.read(args.hyper_parameter_config_dir)
    cfg = dict(cf.items("config"))

    # make the rank table visible to MindSpore
    print("hccl_config_dir:", args.hccl_config_dir)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir
    os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir

    cores = multiprocessing.cpu_count()
    print("the number of logical core:", cores)

    # get device_ips of this host from /etc/hccn.conf
    device_ips = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin.readlines():
            if hccn_item.strip().startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

    # total rank size across all servers; this_server is the entry whose
    # device IPs match the local host
    with open(args.hccl_config_dir, "r", encoding="utf-8") as fin:
        hccl_config = json.loads(fin.read())
        rank_size = 0
        for server in hccl_config["server_list"]:
            rank_size += len(server["device"])
            if server["device"][0]["device_ip"] in device_ips.values():
                this_server = server

    os.environ['RANK_SIZE'] = str(rank_size)
    print("total rank size:", rank_size)
    print("this server rank size:", len(this_server["device"]))

    # split the logical cores evenly among the ranks started on this server
    avg_core_per_rank = int(int(cores) / len(this_server["device"]))
    core_gap = avg_core_per_rank - 1
    print("avg_core_per_rank:", avg_core_per_rank)

    count = 0
    for instance in this_server["device"]:
        device_id = instance["device_id"]
        rank_id = instance["rank_id"]
        print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":")
        print("rank_id:", rank_id)
        print("device_id:", device_id)

        start = count * int(avg_core_per_rank)
        count += 1
        end = start + core_gap
        cmdopt = str(start) + "-" + str(end)

        os.environ["DEVICE_ID"] = device_id
        os.environ["RANK_ID"] = rank_id
        os.environ["DEPLOY_MODE"] = "0"
        os.environ["GE_USE_STATIC_MEMORY"] = "1"

        # prepare a clean per-device working directory and log directory
        os.system("rm -rf LOG" + str(device_id))
        os.system("mkdir ./LOG" + str(device_id))
        os.system("cp *.py ./LOG" + str(device_id))
        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
        os.system("env > ./LOG" + str(device_id) + "/env.log")
        cur_dir = os.getcwd()
        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
        os.environ["GLOG_logtostderr"] = "0"

        print("core_nums:", cmdopt)
        print("epoch_size:", str(cfg['epoch_size']))
        print("data_dir:", data_dir)
        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")

        # bind the process to its core range and forward every option from
        # hyper_parameter_config.ini as --key=value
        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
        opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
        if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
            raise ValueError("hyper_parameter_config.ini can not set 'device_id',"
                             " 'device_num' or 'data_dir'! ")
        cmd += opt
        cmd += " --data_dir=" + data_dir
        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'
        os.system(cmd)


if __name__ == "__main__":
    distribute_pretrain()

View File

@@ -17,7 +17,6 @@ import os
 import sys
 import json
 import socket
-import platform
 from argparse import ArgumentParser
 from typing import Dict, Any
@@ -114,40 +113,25 @@ def main():
             device_id = device_id.split('_')[1]
             device_ips[device_id] = device_ip.strip()
-    arch = platform.processor()
-    hccn_table = {'board_id': {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch],
-                  'chip_info': '910',
-                  'deploy_mode': 'lab',
-                  'group_count': '1',
-                  'group_list': []}
-    instance_list = []
+    hccn_table = {'version': '1.0',
+                  'server_count': '1',
+                  'server_list': []}
+    device_list = []
     rank_id = 0
     for instance_id in device_num_list:
-        instance = {'devices': []}
         device_id = visible_devices[instance_id]
         device_ip = device_ips[device_id]
-        instance['devices'].append({
-            'device_id': device_id,
-            'device_ip': device_ip,
-        })
+        device = {'device_id': device_id,
+                  'device_ip': device_ip,
+                  'rank_id': str(rank_id)}
         print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
-        instance['rank_id'] = str(rank_id)
         rank_id += 1
-        instance['server_id'] = server_id
-        instance_list.append(instance)
-    hccn_table['group_list'].append({
-        'device_num': str(len(device_num_list)),
-        'server_num': '1',
-        'group_name': '',
-        'instance_count': str(len(device_num_list)),
-        'instance_list': instance_list,
-    })
-    hccn_table['para_plane_nic_location'] = 'device'
-    hccn_table['para_plane_nic_name'] = []
-    for instance_id in device_num_list:
-        eth_id = visible_devices[instance_id]
-        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
-    hccn_table['para_plane_nic_num'] = str(len(device_num_list))
+        device_list.append(device)
+    hccn_table['server_list'].append({
+        'server_id': server_id,
+        'device': device_list,
+        'host_nic_ip': 'reserve'
+    })
     hccn_table['status'] = 'completed'
     # save hccn_table to file
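With the new layout above, hccl_tools.py emits a rank table in the same format the distributed launcher consumes. A minimal sketch of what the generated file could look like for a single server with two devices follows; the server ID and device IPs are illustrative, real values come from the host:

```python
import json

# Illustrative example of the table the updated hccl_tools.py writes for one
# server with two visible devices; actual IDs and IPs are read from the host.
hccn_table = {
    'version': '1.0',
    'server_count': '1',
    'server_list': [{
        'server_id': '10.155.111.140',
        'device': [
            {'device_id': '0', 'device_ip': '192.98.92.121', 'rank_id': '0'},
            {'device_id': '1', 'device_ip': '192.98.93.121', 'rank_id': '1'},
        ],
        'host_nic_ip': 'reserve',
    }],
    'status': 'completed',
}
print(json.dumps(hccn_table, indent=4))
```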