forked from mindspore-Ecosystem/mindspore
add_python_distribute_pretrain_script
Signed-off-by: GuoMengHao <guomenghao@huawei.com>
parent c22792aab1
commit 2309e7369a
@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 
     ``` bash
-    sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+    sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
     ```
 
 ### Fine-Tuning and Evaluation
@ -0,0 +1,48 @@

# Run distribute pretrain

## description

The number of Ascend (D) chips to use is determined automatically from the device_num set in the hccl config file; you do not need to specify it yourself.
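For reference, a minimal sketch of what such an hccl config (rank table) might look like for two chips is shown below. The server id, device ids, and IP addresses are placeholders; the field layout follows the `server_list` format that the updated hccl_tools.py in this commit writes out.

``` json
{
    "version": "1.0",
    "server_count": "1",
    "server_list": [{
        "server_id": "10.0.0.1",
        "device": [
            {"device_id": "5", "device_ip": "192.98.92.131", "rank_id": "0"},
            {"device_id": "6", "device_ip": "192.98.93.131", "rank_id": "1"}
        ],
        "host_nic_ip": "reserve"
    }],
    "status": "completed"
}
```

The launcher simply counts the entries under `device` for this server to decide how many training processes to start.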
## how to use

For example, to run distributed training of the BERT model on D chips, execute the following in the `/bert/` directory:

```
python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
```

output:
```
hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
the number of logical core: 192
avg_core_per_rank: 96
rank_size: 2

start training for rank 0, device 5:
rank_id: 0
device_id: 5
core nums: 0-95
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG5/log.txt

start training for rank 1, device 6:
rank_id: 1
device_id: 6
core nums: 96-191
epoch_size: 8
data_dir: /data/small_512/
schema_dir:
log file dir: ./LOG6/log.txt
```
## Note

1. `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).

2. For the hyper parameters, customize `hyper_parameter_config.ini`. The following two parameters must not be configured there, because the launcher sets them itself (see the command sketch after this list):

   - device_id
   - device_num

3. For other models, customize the `run_script` option and the corresponding `hyper_parameter_config.ini`.
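To make the note above concrete, here is a sketch of the kind of command the launcher ends up running for rank 0 in the sample output, using the defaults from `hyper_parameter_config.ini`; the exact flags and values depend on your configuration, so treat this as illustrative only.

``` bash
# illustrative only: the flags before --data_dir are read from hyper_parameter_config.ini
taskset -c 0-95 python ./run_pretrain.py --distribute=true --epoch_size=40 --enable_save_ckpt=true \
    --enable_lossscale=true --do_shuffle=true --enable_data_sink=true --data_sink_steps=100 \
    --save_checkpoint_path=./checkpoint/ --save_checkpoint_steps=10000 --save_checkpoint_num=1 \
    --data_dir=/path/dataset/ --device_id=5 --device_num=2 > ./LOG5/log.txt 2>&1 &
```

`--data_dir`, `--device_id`, and `--device_num` are appended by the launcher itself, which is why they must not appear in the config file.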

@ -0,0 +1,11 @@

``` ini
[config]
distribute=true
epoch_size=40
enable_save_ckpt=true
enable_lossscale=true
do_shuffle=true
enable_data_sink=true
data_sink_steps=100
save_checkpoint_path=./checkpoint/
save_checkpoint_steps=10000
save_checkpoint_num=1
```
@ -0,0 +1,142 @@

``` python
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
"""distribute pretrain script"""
import os
import json
import configparser
import multiprocessing
from argparse import ArgumentParser


def parse_args():
    """
    parse args.

    Returns:
        args.

    Examples:
        >>> parse_args()
    """
    parser = ArgumentParser(description="mindspore distributed training")

    parser.add_argument("--run_script_dir", type=str, default="",
                        help="Run script path, it is better to use absolute path")
    parser.add_argument("--hyper_parameter_config_dir", type=str, default="",
                        help="Hyper Parameter config path, it is better to use absolute path")
    parser.add_argument("--data_dir", type=str, default="",
                        help="Data path, it is better to use absolute path")
    parser.add_argument("--hccl_config_dir", type=str, default="",
                        help="Hccl config path, it is better to use absolute path")

    args = parser.parse_args()
    return args


def distribute_pretrain():
    """
    distribute pretrain scripts. The number of D chips is allocated automatically
    based on the device_num set in the hccl config file; you do not need to specify it.
    """
    print("start", __file__)
    args = parse_args()

    run_script = args.run_script_dir
    data_dir = args.data_dir
    cf = configparser.ConfigParser()
    cf.read(args.hyper_parameter_config_dir)
    cfg = dict(cf.items("config"))

    print("hccl_config_dir:", args.hccl_config_dir)
    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir
    os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir

    cores = multiprocessing.cpu_count()
    print("the number of logical core:", cores)

    # get device_ips: map local device ids to their NIC IPs from /etc/hccn.conf
    device_ips = {}
    with open('/etc/hccn.conf', 'r') as fin:
        for hccn_item in fin.readlines():
            if hccn_item.strip().startswith('address_'):
                device_id, device_ip = hccn_item.split('=')
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

    # read the rank table, count the total rank size and find this server's entry
    with open(args.hccl_config_dir, "r", encoding="utf-8") as fin:
        hccl_config = json.loads(fin.read())
        rank_size = 0
        for server in hccl_config["server_list"]:
            rank_size += len(server["device"])
            if server["device"][0]["device_ip"] in device_ips.values():
                this_server = server

    os.environ['RANK_SIZE'] = str(rank_size)
    print("total rank size:", rank_size)
    print("this server rank size:", len(this_server["device"]))
    avg_core_per_rank = int(int(cores) / len(this_server["device"]))
    core_gap = avg_core_per_rank - 1
    print("avg_core_per_rank:", avg_core_per_rank)

    count = 0
    for instance in this_server["device"]:
        # one training process per device listed for this server
        device_id = instance["device_id"]
        rank_id = instance["rank_id"]
        print("\nstart training for rank " + str(rank_id) + ", device " + str(device_id) + ":")
        print("rank_id:", rank_id)
        print("device_id:", device_id)

        # bind each process to its own slice of CPU cores
        start = count * int(avg_core_per_rank)
        count += 1
        end = start + core_gap
        cmdopt = str(start) + "-" + str(end)

        os.environ["DEVICE_ID"] = device_id
        os.environ["RANK_ID"] = rank_id
        os.environ["DEPLOY_MODE"] = "0"
        os.environ["GE_USE_STATIC_MEMORY"] = "1"

        os.system("rm -rf LOG" + str(device_id))
        os.system("mkdir ./LOG" + str(device_id))
        os.system("cp *.py ./LOG" + str(device_id))
        os.system("mkdir -p ./LOG" + str(device_id) + "/ms_log")
        os.system("env > ./LOG" + str(device_id) + "/env.log")

        cur_dir = os.getcwd()
        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + str(device_id) + "/ms_log"
        os.environ["GLOG_logtostderr"] = "0"

        print("core_nums:", cmdopt)
        print("epoch_size:", str(cfg['epoch_size']))
        print("data_dir:", data_dir)
        print("log_file_dir: ./LOG" + str(device_id) + "/log.txt")

        # build the training command: config options first, then the launcher-owned flags
        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
        opt = " ".join(["--" + key + "=" + str(cfg[key]) for key in cfg.keys()])
        if ('device_id' in opt) or ('device_num' in opt) or ('data_dir' in opt):
            raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
                             " 'device_num' or 'data_dir'! ")
        cmd += opt
        cmd += " --data_dir=" + data_dir
        cmd += ' --device_id=' + str(device_id) + ' --device_num=' \
               + str(rank_size) + ' >./LOG' + str(device_id) + '/log.txt 2>&1 &'

        os.system(cmd)


if __name__ == "__main__":
    distribute_pretrain()
```
@ -16,57 +16,16 @@
 
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
+echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
 echo "It is better to use absolute path."
+echo "For hyper parameter, please note that you should customize the scripts:
+          '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
 echo "=============================================================================================================="
 
-EPOCH_SIZE=$2
-DATA_DIR=$3
-SCHEMA_DIR=$4
-PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
-export RANK_TABLE_FILE=$5
-export RANK_SIZE=$1
-cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
-echo "the number of logical core" $cores
-avg_core_per_rank=`expr $cores \/ $RANK_SIZE`
-core_gap=`expr $avg_core_per_rank \- 1`
-echo "avg_core_per_rank" $avg_core_per_rank
-echo "core_gap" $core_gap
-for((i=0;i<RANK_SIZE;i++))
-do
-    start=`expr $i \* $avg_core_per_rank`
-    export DEVICE_ID=$i
-    export RANK_ID=$i
-    export DEPLOY_MODE=0
-    export GE_USE_STATIC_MEMORY=1
-    end=`expr $start \+ $core_gap`
-    cmdopt=$start"-"$end
-
-    rm -rf LOG$i
-    mkdir ./LOG$i
-    cp *.py ./LOG$i
-    cd ./LOG$i || exit
-    echo "start training for rank $i, device $DEVICE_ID"
-    mkdir -p ms_log
-    CUR_DIR=`pwd`
-    export GLOG_log_dir=${CUR_DIR}/ms_log
-    export GLOG_logtostderr=0
-    env > env.log
-    taskset -c $cmdopt python ${PROJECT_DIR}/../run_pretrain.py \
-    --distribute="true" \
-    --epoch_size=$EPOCH_SIZE \
-    --device_id=$DEVICE_ID \
-    --device_num=$RANK_SIZE \
-    --enable_save_ckpt="true" \
-    --enable_lossscale="true" \
-    --do_shuffle="true" \
-    --enable_data_sink="true" \
-    --data_sink_steps=100 \
-    --load_checkpoint_path="" \
-    --save_checkpoint_steps=10000 \
-    --save_checkpoint_num=1 \
-    --data_dir=$DATA_DIR \
-    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
-    cd ../
-done
+CUR_DIR=`pwd`
+python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \
+    --run_script_dir=${CUR_DIR}/run_pretrain.py \
+    --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
+    --data_dir=$1 \
+    --hccl_config_dir=$2
@ -17,7 +17,6 @@ import os
 import sys
 import json
 import socket
-import platform
 from argparse import ArgumentParser
 from typing import Dict, Any
@ -114,40 +113,25 @@ def main():
 
                 device_id = device_id.split('_')[1]
                 device_ips[device_id] = device_ip.strip()
 
-    arch = platform.processor()
-    hccn_table = {'board_id': {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch],
-                  'chip_info': '910',
-                  'deploy_mode': 'lab',
-                  'group_count': '1',
-                  'group_list': []}
-    instance_list = []
+    hccn_table = {'version': '1.0',
+                  'server_count': '1',
+                  'server_list': []}
+    device_list = []
     rank_id = 0
     for instance_id in device_num_list:
-        instance = {'devices': []}
         device_id = visible_devices[instance_id]
         device_ip = device_ips[device_id]
-        instance['devices'].append({
-            'device_id': device_id,
-            'device_ip': device_ip,
-        })
+        device = {'device_id': device_id,
+                  'device_ip': device_ip,
+                  'rank_id': str(rank_id)}
         print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
-        instance['rank_id'] = str(rank_id)
         rank_id += 1
-        instance['server_id'] = server_id
-        instance_list.append(instance)
-    hccn_table['group_list'].append({
-        'device_num': str(len(device_num_list)),
-        'server_num': '1',
-        'group_name': '',
-        'instance_count': str(len(device_num_list)),
-        'instance_list': instance_list,
-    })
+        device_list.append(device)
+    hccn_table['server_list'].append({
+        'server_id': server_id,
+        'device': device_list,
+        'host_nic_ip': 'reserve'
+    })
-    hccn_table['para_plane_nic_location'] = 'device'
-    hccn_table['para_plane_nic_name'] = []
-    for instance_id in device_num_list:
-        eth_id = visible_devices[instance_id]
-        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
-    hccn_table['para_plane_nic_num'] = str(len(device_num_list))
     hccn_table['status'] = 'completed'
 
     # save hccn_table to file
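For completeness, generating a two-device table such as the `hccl_2p_56_x.x.x.x.json` referenced in the launcher README might look like the sketch below. The `--device_num` flag and its half-open range syntax are assumptions based on the hccl_tools repository linked earlier, not something this commit shows, so check the tool's `--help` output for the exact interface.

``` bash
# assumed invocation -- the flag name and range syntax may differ in your copy of hccl_tools.py
python model_zoo/utils/hccl_tools/hccl_tools.py --device_num "[5,7)"
# expected to produce a rank table named like hccl_2p_56_x.x.x.x.json (2 devices, ids 5 and 6)
```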