diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index b971efd0f9f..a54c1faf991 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 
     ``` bash
-    sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+    sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
     ```
 
 ### Fine-Tuning and Evaluation
diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md
new file mode 100644
index 00000000000..b492c4c3097
--- /dev/null
+++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md
@@ -0,0 +1,48 @@
+# Run distribute pretrain
+
+## Description
+The number of D chips is allocated automatically based on the device_num set in the hccl config file; you do not need to specify it.
+
+
+## How to use
+For example, to run distributed training of the BERT model on D chips, execute the following in the `/bert/` directory:
+```
+python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+```
+
+Output:
+
+```
+hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+the number of logical core: 192
+avg_core_per_rank: 96
+rank_size: 2
+
+start training for rank 0, device 5:
+rank_id: 0
+device_id: 5
+core nums: 0-95
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG5/log.txt
+
+start training for rank 1, device 6:
+rank_id: 1
+device_id: 6
+core nums: 96-191
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG6/log.txt
+```
+
+## Note
+
+1. `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+
+2. Hyper parameters should be customized in `hyper_parameter_config.ini`. The following two parameters must not be configured there:
+    device_id
+    device_num
+
+3. For other models, customize the `run_script` option and the corresponding `hyper_parameter_config.ini`.
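The launcher added in this patch derives everything it needs from the rank table. Below is a minimal sketch of that allocation logic, mirroring `run_distribute_pretrain.py` under the assumption of a single-server `server_list`-style table; the JSON file name and the `plan_core_ranges` helper are illustrative, not part of the patch.

```python
import json
import multiprocessing


def plan_core_ranges(hccl_config_path="hccl_2p_x.x.x.x.json"):
    """Sketch: derive rank size and per-rank CPU core ranges from a rank table."""
    with open(hccl_config_path, "r", encoding="utf-8") as fin:
        hccl_config = json.load(fin)

    # Total rank size is the number of devices across all servers in the table.
    rank_size = sum(len(server["device"]) for server in hccl_config["server_list"])

    # Assume a single-server table here; the real launcher picks the local server
    # by matching device IPs against /etc/hccn.conf.
    this_server = hccl_config["server_list"][0]
    cores = multiprocessing.cpu_count()
    avg_core_per_rank = cores // len(this_server["device"])

    plan = []
    for count, device in enumerate(this_server["device"]):
        start = count * avg_core_per_rank
        end = start + avg_core_per_rank - 1  # the range handed to `taskset -c start-end`
        plan.append((device["rank_id"], device["device_id"], f"{start}-{end}"))
    return rank_size, plan
```

Each `(rank_id, device_id, core range)` triple corresponds to one `taskset -c <range> python run_pretrain.py ...` process started by the launcher.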
diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/__init__.py b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/__init__.py new file mode 100644 index 00000000000..e69de29bb2d diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini new file mode 100644 index 00000000000..2298f83509b --- /dev/null +++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini @@ -0,0 +1,11 @@ +[config] +distribute=true +epoch_size=40 +enable_save_ckpt=true +enable_lossscale=true +do_shuffle=true +enable_data_sink=true +data_sink_steps=100 +save_checkpoint_path=./checkpoint/ +save_checkpoint_steps=10000 +save_checkpoint_num=1 \ No newline at end of file diff --git a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py new file mode 100644 index 00000000000..b230f71fad2 --- /dev/null +++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py @@ -0,0 +1,142 @@ +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# ============================================================================ +"""distribute pretrain script""" +import os +import json +import configparser +import multiprocessing +from argparse import ArgumentParser + + +def parse_args(): + """ + parse args . + + Args: + + Returns: + args. + + Examples: + >>> parse_args() + """ + parser = ArgumentParser(description="mindspore distributed training") + + parser.add_argument("--run_script_dir", type=str, default="", + help="Run script path, it is better to use absolute path") + parser.add_argument("--hyper_parameter_config_dir", type=str, default="", + help="Hyper Parameter config path, it is better to use absolute path") + parser.add_argument("--data_dir", type=str, default="", + help="Data path, it is better to use absolute path") + parser.add_argument("--hccl_config_dir", type=str, default="", + help="Hccl config path, it is better to use absolute path") + + args = parser.parse_args() + return args + + +def distribute_pretrain(): + """ + distribute pretrain scripts. The number of D chips can be automatically allocated + based on the device_num set in hccl config file, You don not need to specify that. 
+ """ + print("start", __file__) + args = parse_args() + + run_script = args.run_script_dir + data_dir = args.data_dir + cf = configparser.ConfigParser() + cf.read(args.hyper_parameter_config_dir) + cfg = dict(cf.items("config")) + + print("hccl_config_dir:", args.hccl_config_dir) + os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir + os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir + + cores = multiprocessing.cpu_count() + print("the number of logical core:", cores) + + # get device_ips + device_ips = {} + with open('/etc/hccn.conf', 'r') as fin: + for hccn_item in fin.readlines(): + if hccn_item.strip().startswith('address_'): + device_id, device_ip = hccn_item.split('=') + device_id = device_id.split('_')[1] + device_ips[device_id] = device_ip.strip() + + with open(args.hccl_config_dir, "r", encoding="utf-8") as fin: + hccl_config = json.loads(fin.read()) + rank_size = 0 + for server in hccl_config["server_list"]: + rank_size += len(server["device"]) + if server["device"][0]["device_ip"] in device_ips.values(): + this_server = server + + os.environ['RANK_SIZE'] = str(rank_size) + print("total rank size:", rank_size) + print("this server rank size:", len(this_server["device"])) + avg_core_per_rank = int(int(cores) / len(this_server["device"])) + core_gap = avg_core_per_rank - 1 + print("avg_core_per_rank:", avg_core_per_rank) + + count = 0 + for instance in this_server["device"]: + device_id = instance["device_id"] + rank_id = instance["rank_id"] + print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":") + print("rank_id:", rank_id) + print("device_id:", device_id) + + start = count * int(avg_core_per_rank) + count += 1 + end = start + core_gap + cmdopt = str(start) + "-" + str(end) + + os.environ["DEVICE_ID"] = device_id + os.environ["RANK_ID"] = rank_id + os.environ["DEPLOY_MODE"] = "0" + os.environ["GE_USE_STATIC_MEMORY"] = "1" + + os.system("rm -rf LOG" + str(device_id)) + os.system("mkdir ./LOG" + str(device_id)) + os.system("cp *.py ./LOG" + str(device_id)) + os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log") + os.system("env > ./LOG" + str(device_id) + "/env.log") + + cur_dir = os.getcwd() + os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log" + os.environ["GLOG_logtostderr"] = "0" + + print("core_nums:", cmdopt) + print("epoch_size:", str(cfg['epoch_size'])) + print("data_dir:", data_dir) + print("log_file_dir: ./LOG" + str(device_id) + "/log.txt") + + cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " " + opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) + if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt): + raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," + " 'device_num' or 'data_dir'! 
") + cmd += opt + cmd += " --data_dir=" + data_dir + cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &' + + os.system(cmd) + + +if __name__ == "__main__": + distribute_pretrain() diff --git a/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh b/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh index eb3a0979d15..422309fea45 100644 --- a/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh +++ b/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh @@ -16,57 +16,16 @@ echo "==============================================================================================================" echo "Please run the scipt as: " -echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH" -echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json" +echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH" +echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json" echo "It is better to use absolute path." +echo "For hyper parameter, please note that you should customize the scripts: + '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' " echo "==============================================================================================================" +CUR_DIR=`pwd` -EPOCH_SIZE=$2 -DATA_DIR=$3 -SCHEMA_DIR=$4 -PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) -export RANK_TABLE_FILE=$5 -export RANK_SIZE=$1 -cores=`cat /proc/cpuinfo|grep "processor" |wc -l` -echo "the number of logical core" $cores -avg_core_per_rank=`expr $cores \/ $RANK_SIZE` -core_gap=`expr $avg_core_per_rank \- 1` -echo "avg_core_per_rank" $avg_core_per_rank -echo "core_gap" $core_gap -for((i=0;i env.log - taskset -c $cmdopt python ${PROJECT_DIR}/../run_pretrain.py \ - --distribute="true" \ - --epoch_size=$EPOCH_SIZE \ - --device_id=$DEVICE_ID \ - --device_num=$RANK_SIZE \ - --enable_save_ckpt="true" \ - --enable_lossscale="true" \ - --do_shuffle="true" \ - --enable_data_sink="true" \ - --data_sink_steps=100 \ - --load_checkpoint_path="" \ - --save_checkpoint_steps=10000 \ - --save_checkpoint_num=1 \ - --data_dir=$DATA_DIR \ - --schema_dir=$SCHEMA_DIR > log.txt 2>&1 & - cd ../ -done +python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \ + --run_script_dir=${CUR_DIR}/run_pretrain.py \ + --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \ + --data_dir=$1 \ + --hccl_config_dir=$2 diff --git a/model_zoo/utils/ascend_distributed_launcher/README.md b/model_zoo/utils/ascend_distributed_launcher/README.md new file mode 100644 index 00000000000..cefdaee3e8c --- /dev/null +++ b/model_zoo/utils/ascend_distributed_launcher/README.md @@ -0,0 +1,48 @@ +# Run distribute pretrain + +## description +The number of D chips can be automatically allocated based on the device_num set in hccl config file, You don not need to specify that. 
+
+
+## How to use
+For example, to run distributed training of the BERT model on D chips, execute the following in the `/bert/` directory:
+```
+python model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+```
+
+Output:
+
+```
+hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+the number of logical core: 192
+avg_core_per_rank: 96
+rank_size: 2
+
+start training for rank 0, device 5:
+rank_id: 0
+device_id: 5
+core nums: 0-95
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG5/log.txt
+
+start training for rank 1, device 6:
+rank_id: 1
+device_id: 6
+core nums: 96-191
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG6/log.txt
+```
+
+## Note
+
+1. `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+
+2. Hyper parameters should be customized in `hyper_parameter_config.ini`. The following two parameters must not be configured there:
+    device_id
+    device_num
+
+3. For other models, customize the `run_script` option and the corresponding `hyper_parameter_config.ini`.
diff --git a/model_zoo/utils/ascend_distributed_launcher/__init__.py b/model_zoo/utils/ascend_distributed_launcher/__init__.py
new file mode 100644
index 00000000000..e69de29bb2d
diff --git a/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini b/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini
new file mode 100644
index 00000000000..2298f83509b
--- /dev/null
+++ b/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini
@@ -0,0 +1,11 @@
+[config]
+distribute=true
+epoch_size=40
+enable_save_ckpt=true
+enable_lossscale=true
+do_shuffle=true
+enable_data_sink=true
+data_sink_steps=100
+save_checkpoint_path=./checkpoint/
+save_checkpoint_steps=10000
+save_checkpoint_num=1
\ No newline at end of file
diff --git a/model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py b/model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py
new file mode 100644
index 00000000000..b230f71fad2
--- /dev/null
+++ b/model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py
@@ -0,0 +1,142 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""distribute pretrain script"""
+import os
+import json
+import configparser
+import multiprocessing
+from argparse import ArgumentParser
+
+
+def parse_args():
+    """
+    parse args .
+
+    Args:
+
+    Returns:
+        args.
+ + Examples: + >>> parse_args() + """ + parser = ArgumentParser(description="mindspore distributed training") + + parser.add_argument("--run_script_dir", type=str, default="", + help="Run script path, it is better to use absolute path") + parser.add_argument("--hyper_parameter_config_dir", type=str, default="", + help="Hyper Parameter config path, it is better to use absolute path") + parser.add_argument("--data_dir", type=str, default="", + help="Data path, it is better to use absolute path") + parser.add_argument("--hccl_config_dir", type=str, default="", + help="Hccl config path, it is better to use absolute path") + + args = parser.parse_args() + return args + + +def distribute_pretrain(): + """ + distribute pretrain scripts. The number of D chips can be automatically allocated + based on the device_num set in hccl config file, You don not need to specify that. + """ + print("start", __file__) + args = parse_args() + + run_script = args.run_script_dir + data_dir = args.data_dir + cf = configparser.ConfigParser() + cf.read(args.hyper_parameter_config_dir) + cfg = dict(cf.items("config")) + + print("hccl_config_dir:", args.hccl_config_dir) + os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir + os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir + + cores = multiprocessing.cpu_count() + print("the number of logical core:", cores) + + # get device_ips + device_ips = {} + with open('/etc/hccn.conf', 'r') as fin: + for hccn_item in fin.readlines(): + if hccn_item.strip().startswith('address_'): + device_id, device_ip = hccn_item.split('=') + device_id = device_id.split('_')[1] + device_ips[device_id] = device_ip.strip() + + with open(args.hccl_config_dir, "r", encoding="utf-8") as fin: + hccl_config = json.loads(fin.read()) + rank_size = 0 + for server in hccl_config["server_list"]: + rank_size += len(server["device"]) + if server["device"][0]["device_ip"] in device_ips.values(): + this_server = server + + os.environ['RANK_SIZE'] = str(rank_size) + print("total rank size:", rank_size) + print("this server rank size:", len(this_server["device"])) + avg_core_per_rank = int(int(cores) / len(this_server["device"])) + core_gap = avg_core_per_rank - 1 + print("avg_core_per_rank:", avg_core_per_rank) + + count = 0 + for instance in this_server["device"]: + device_id = instance["device_id"] + rank_id = instance["rank_id"] + print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":") + print("rank_id:", rank_id) + print("device_id:", device_id) + + start = count * int(avg_core_per_rank) + count += 1 + end = start + core_gap + cmdopt = str(start) + "-" + str(end) + + os.environ["DEVICE_ID"] = device_id + os.environ["RANK_ID"] = rank_id + os.environ["DEPLOY_MODE"] = "0" + os.environ["GE_USE_STATIC_MEMORY"] = "1" + + os.system("rm -rf LOG" + str(device_id)) + os.system("mkdir ./LOG" + str(device_id)) + os.system("cp *.py ./LOG" + str(device_id)) + os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log") + os.system("env > ./LOG" + str(device_id) + "/env.log") + + cur_dir = os.getcwd() + os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log" + os.environ["GLOG_logtostderr"] = "0" + + print("core_nums:", cmdopt) + print("epoch_size:", str(cfg['epoch_size'])) + print("data_dir:", data_dir) + print("log_file_dir: ./LOG" + str(device_id) + "/log.txt") + + cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " " + opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()]) + if ('device_id' in opt) or ('device_num' in opt) or 
('data_dir' in opt): + raise ValueError("hyper_parameter_config.ini can not setting 'device_id'," + " 'device_num' or 'data_dir'! ") + cmd += opt + cmd += " --data_dir=" + data_dir + cmd += ' --device_id=' + str(device_id) + ' --device_num=' \ + + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &' + + os.system(cmd) + + +if __name__ == "__main__": + distribute_pretrain() diff --git a/model_zoo/utils/hccl_tools/hccl_tools.py b/model_zoo/utils/hccl_tools/hccl_tools.py index ac4114c0a8d..5afcb231599 100644 --- a/model_zoo/utils/hccl_tools/hccl_tools.py +++ b/model_zoo/utils/hccl_tools/hccl_tools.py @@ -17,7 +17,6 @@ import os import sys import json import socket -import platform from argparse import ArgumentParser from typing import Dict, Any @@ -114,40 +113,25 @@ def main(): device_id = device_id.split('_')[1] device_ips[device_id] = device_ip.strip() - arch = platform.processor() - hccn_table = {'board_id': {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch], - 'chip_info': '910', - 'deploy_mode': 'lab', - 'group_count': '1', - 'group_list': []} - instance_list = [] + hccn_table = {'version': '1.0', + 'server_count': '1', + 'server_list': []} + device_list = [] rank_id = 0 for instance_id in device_num_list: - instance = {'devices': []} device_id = visible_devices[instance_id] device_ip = device_ips[device_id] - instance['devices'].append({ - 'device_id': device_id, - 'device_ip': device_ip, - }) + device = {'device_id': device_id, + 'device_ip': device_ip, + 'rank_id': str(rank_id)} print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip)) - instance['rank_id'] = str(rank_id) rank_id += 1 - instance['server_id'] = server_id - instance_list.append(instance) - hccn_table['group_list'].append({ - 'device_num': str(len(device_num_list)), - 'server_num': '1', - 'group_name': '', - 'instance_count': str(len(device_num_list)), - 'instance_list': instance_list, + device_list.append(device) + hccn_table['server_list'].append({ + 'server_id': server_id, + 'device': device_list, + 'host_nic_ip': 'reserve' }) - hccn_table['para_plane_nic_location'] = 'device' - hccn_table['para_plane_nic_name'] = [] - for instance_id in device_num_list: - eth_id = visible_devices[instance_id] - hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id)) - hccn_table['para_plane_nic_num'] = str(len(device_num_list)) hccn_table['status'] = 'completed' # save hccn_table to file
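The updated `hccl_tools.py` above writes the rank table in the `server_list` format that the launcher consumes. The sketch below shows the rough shape of that table as a Python dict; the server ID and device IP values are placeholders, not output produced by this patch.

```python
import json

# Placeholder values only; a real table carries the NIC IPs read from /etc/hccn.conf.
hccn_table = {
    "version": "1.0",
    "server_count": "1",
    "server_list": [
        {
            "server_id": "10.0.0.1",  # host IP of this server (placeholder)
            "device": [
                {"device_id": "0", "device_ip": "192.98.92.100", "rank_id": "0"},
                {"device_id": "1", "device_ip": "192.98.93.100", "rank_id": "1"},
            ],
            "host_nic_ip": "reserve",
        }
    ],
    "status": "completed",
}

# Pretty-print the table the way it would be saved to the json file.
print(json.dumps(hccn_table, indent=4))
```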