add_python_distribute_pretrain_script

Signed-off-by: GuoMengHao <guomenghao@huawei.com>
2020-07-27 11:21:43 +08:00 · 2020-07-27 11:21:43 +08:00 · 2309e7369a
parent c22792aab1
commit 2309e7369a
11 changed files with 425 additions and 80 deletions
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.

    ``` bash   
-    sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+    sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
    ```  

 ### Fine-Tuning and Evaluation
--- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md
+++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/README.md
@ -0,0 +1,48 @@
+# Run distribute pretrain
+
+## description
+The number of D chips can be automatically allocated based on the device_num set in the hccl config file. You do not need to specify it.
+
+
+## how to use
+For example, to run distributed training of the Bert model on D chips, run the following command in the `/bert/` directory:
+```
+python ./scripts/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir ./scripts/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+```
+
+output:
+
+```
+hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+the number of logical core: 192
+avg_core_per_rank: 96
+rank_size: 2
+
+start training for rank 0, device 5:
+rank_id: 0
+device_id: 5
+core nums: 0-95
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG5/log.txt
+
+start training for rank 1, device 6:
+rank_id: 1
+device_id: 6
+core nums: 96-191
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG6/log.txt
+```
+
+## Note
+
+1. Note that `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+
+2. For hyper parameters, please customize the file `hyper_parameter_config.ini`. Please note that the following two hyper parameters are not allowed to be configured in that file:
+    device_id
+    device_num
+
+3. For other models, please note that you should customize the option `run_script_dir` and the corresponding `hyper_parameter_config.ini`.
--- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/init.py
+++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/init.py
--- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini
+++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/hyper_parameter_config.ini
@ -0,0 +1,11 @@
+[config]
+distribute=true
+epoch_size=40
+enable_save_ckpt=true
+enable_lossscale=true
+do_shuffle=true
+enable_data_sink=true
+data_sink_steps=100
+save_checkpoint_path=./checkpoint/
+save_checkpoint_steps=10000
+save_checkpoint_num=1
--- a/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
+++ b/model_zoo/official/nlp/bert/scripts/ascend_distributed_launcher/run_distribute_pretrain.py
@ -0,0 +1,142 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""distribute pretrain script"""
+import os
+import json
+import configparser
+import multiprocessing
+from argparse import ArgumentParser
+
+
+def parse_args():
+    """
+    Build the launcher's command-line parser and parse the process arguments.
+
+    Every option is a string that defaults to "" when it is omitted:
+    --run_script_dir, --hyper_parameter_config_dir, --data_dir, --hccl_config_dir.
+
+    Returns:
+        argparse.Namespace, the parsed options.
+
+    Examples:
+        >>> parse_args()
+    """
+    cli = ArgumentParser(description="mindspore distributed training")
+
+    # (flag, leading part of the help text) for every launcher option.
+    option_table = (
+        ("--run_script_dir", "Run script path"),
+        ("--hyper_parameter_config_dir", "Hyper Parameter config path"),
+        ("--data_dir", "Data path"),
+        ("--hccl_config_dir", "Hccl config path"),
+    )
+    for flag, summary in option_table:
+        cli.add_argument(flag, type=str, default="",
+                         help=summary + ", it is better to use absolute path")
+
+    return cli.parse_args()
+
+
+def distribute_pretrain():
+    """
+    Launch one background training process per device of this server.
+
+    The devices are taken from the hccl config file given by --hccl_config_dir, so you
+    do not need to specify the device count. The server entry used is the one whose
+    first device IP is listed in /etc/hccn.conf. For every device of that server the
+    function recreates ./LOG<device_id>, exports the per-rank environment variables and
+    starts the run script under `taskset` in the background, redirecting its stdout and
+    stderr to ./LOG<device_id>/log.txt.
+
+    Raises:
+        ValueError: if the hyper parameter config sets 'device_id', 'device_num' or
+            'data_dir', or if no server in the hccl config matches /etc/hccn.conf.
+    """
+    print("start", __file__)
+    args = parse_args()
+
+    run_script = args.run_script_dir
+    data_dir = args.data_dir
+    cf = configparser.ConfigParser()
+    cf.read(args.hyper_parameter_config_dir)
+    cfg = dict(cf.items("config"))
+
+    # These options are generated by the launcher itself. Compare option names exactly
+    # (not substrings of the joined command line, which would also reject harmless
+    # values) and fail before any log directory is touched or process is started.
+    reserved = {'device_id', 'device_num', 'data_dir'}
+    if reserved.intersection(cfg):
+        raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
+                         " 'device_num' or 'data_dir'! ")
+    opt = " ".join("--" + key + "=" + str(value) for key, value in cfg.items())
+
+    print("hccl_config_dir:", args.hccl_config_dir)
+    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir
+    os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir
+
+    cores = multiprocessing.cpu_count()
+    print("the number of logical core:", cores)
+
+    # get device_ips
+    device_ips = {}
+    with open('/etc/hccn.conf', 'r') as fin:
+        for hccn_item in fin:
+            if hccn_item.strip().startswith('address_'):
+                device_id, device_ip = hccn_item.split('=')
+                device_id = device_id.split('_')[1]
+                device_ips[device_id] = device_ip.strip()
+
+    with open(args.hccl_config_dir, "r", encoding="utf-8") as fin:
+        hccl_config = json.load(fin)
+
+    # rank_size counts the devices of every server; this_server is the entry whose
+    # first device IP is configured on this host.
+    rank_size = 0
+    this_server = None
+    for server in hccl_config["server_list"]:
+        rank_size += len(server["device"])
+        if server["device"][0]["device_ip"] in device_ips.values():
+            this_server = server
+    if this_server is None:
+        raise ValueError("no server in " + args.hccl_config_dir +
+                         " has a device_ip listed in /etc/hccn.conf")
+
+    os.environ['RANK_SIZE'] = str(rank_size)
+    print("total rank size:", rank_size)
+    print("this server rank size:", len(this_server["device"]))
+    avg_core_per_rank = cores // len(this_server["device"])
+    core_gap = avg_core_per_rank - 1
+    print("avg_core_per_rank:", avg_core_per_rank)
+
+    cur_dir = os.getcwd()
+    for count, instance in enumerate(this_server["device"]):
+        # os.environ only accepts str values; convert in case the JSON stores numbers.
+        device_id = str(instance["device_id"])
+        rank_id = str(instance["rank_id"])
+        print("\nstart training for rank " + rank_id + ", device " + device_id + ":")
+        print("rank_id:", rank_id)
+        print("device_id:", device_id)
+
+        # Each rank is bound (via taskset) to its own contiguous range of logical cores.
+        start = count * avg_core_per_rank
+        end = start + core_gap
+        cmdopt = str(start) + "-" + str(end)
+
+        os.environ["DEVICE_ID"] = device_id
+        os.environ["RANK_ID"] = rank_id
+        os.environ["DEPLOY_MODE"] = "0"
+        os.environ["GE_USE_STATIC_MEMORY"] = "1"
+        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + device_id + "/ms_log"
+        os.environ["GLOG_logtostderr"] = "0"
+
+        os.system("rm -rf LOG" + device_id)
+        os.system("mkdir ./LOG" + device_id)
+        os.system("cp *.py ./LOG" + device_id)
+        os.system("mkdir -p ./LOG" + device_id + "/ms_log")
+        # Dump the environment only after the GLOG_* variables are set, so env.log
+        # records everything the training process inherits.
+        os.system("env > ./LOG" + device_id + "/env.log")
+
+        print("core_nums:", cmdopt)
+        # epoch_size is only echoed for information; do not fail if a config omits it.
+        print("epoch_size:", str(cfg.get('epoch_size', '')))
+        print("data_dir:", data_dir)
+        print("log_file_dir: ./LOG" + device_id + "/log.txt")
+
+        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
+        cmd += opt
+        cmd += " --data_dir=" + data_dir
+        cmd += ' --device_id=' + device_id + ' --device_num=' \
+               + str(rank_size) + ' >./LOG' + device_id + '/log.txt 2>&1 &'
+
+        # The trailing '&' makes the shell start the training process in the background.
+        os.system(cmd)
+
+
+if __name__ == "__main__":
+    distribute_pretrain()
--- a/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_distribute_pretrain.sh
@ -16,57 +16,16 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
+echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
 echo "It is better to use absolute path."
+echo "For hyper parameter, please note that you should customize the scripts:
+          '{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
 echo "=============================================================================================================="
-
-EPOCH_SIZE=$2
-DATA_DIR=$3
-SCHEMA_DIR=$4
-PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
-export RANK_TABLE_FILE=$5
-export RANK_SIZE=$1
-cores=`cat /proc/cpuinfo|grep "processor" |wc -l`
-echo "the number of logical core" $cores
-avg_core_per_rank=`expr $cores \/ $RANK_SIZE`
-core_gap=`expr $avg_core_per_rank \- 1`
-echo "avg_core_per_rank" $avg_core_per_rank
-echo "core_gap" $core_gap
-for((i=0;i<RANK_SIZE;i++))
-do
-    start=`expr $i \* $avg_core_per_rank`
-    export DEVICE_ID=$i
-    export RANK_ID=$i
-    export DEPLOY_MODE=0
-    export GE_USE_STATIC_MEMORY=1
-    end=`expr $start \+ $core_gap`
-    cmdopt=$start"-"$end
-
-    rm -rf LOG$i
-    mkdir ./LOG$i
-    cp  *.py ./LOG$i
-    cd ./LOG$i || exit
-    echo "start training for rank $i, device $DEVICE_ID"
-    mkdir -p ms_log
 CUR_DIR=`pwd`
-    export GLOG_log_dir=${CUR_DIR}/ms_log
-    export GLOG_logtostderr=0
-    env > env.log
-    taskset -c $cmdopt python ${PROJECT_DIR}/../run_pretrain.py  \
-    --distribute="true" \
-    --epoch_size=$EPOCH_SIZE \
-    --device_id=$DEVICE_ID \
-    --device_num=$RANK_SIZE \
-    --enable_save_ckpt="true" \
-    --enable_lossscale="true" \
-    --do_shuffle="true" \
-    --enable_data_sink="true" \
-    --data_sink_steps=100 \
-    --load_checkpoint_path="" \
-    --save_checkpoint_steps=10000 \
-    --save_checkpoint_num=1 \
-    --data_dir=$DATA_DIR \
-    --schema_dir=$SCHEMA_DIR > log.txt 2>&1 &
-    cd ../
-done
+
+python ${CUR_DIR}/scripts/ascend_distributed_launcher/run_distribute_pretrain.py \
+    --run_script_dir=${CUR_DIR}/run_pretrain.py \
+    --hyper_parameter_config_dir=${CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini \
+    --data_dir=$1 \
+    --hccl_config_dir=$2
--- a/model_zoo/utils/ascend_distributed_launcher/README.md
+++ b/model_zoo/utils/ascend_distributed_launcher/README.md
@ -0,0 +1,48 @@
+# Run distribute pretrain
+
+## description
+The number of D chips can be automatically allocated based on the device_num set in the hccl config file. You do not need to specify it.
+
+
+## how to use
+For example, to run distributed training of the Bert model on D chips, run the following command in the `/bert/` directory:
+```
+python model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py --run_script_dir ./run_pretrain.py --hyper_parameter_config_dir model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini --data_dir /path/dataset/ --hccl_config_dir model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+```
+
+output:
+
+```
+hccl_config_dir: model_zoo/utils/hccl_tools/hccl_2p_56_x.x.x.x.json
+the number of logical core: 192
+avg_core_per_rank: 96
+rank_size: 2
+
+start training for rank 0, device 5:
+rank_id: 0
+device_id: 5
+core nums: 0-95
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG5/log.txt
+
+start training for rank 1, device 6:
+rank_id: 1
+device_id: 6
+core nums: 96-191
+epoch_size: 8
+data_dir: /data/small_512/
+schema_dir:
+log file dir: ./LOG6/log.txt
+```
+
+## Note
+
+1. Note that `hccl_2p_56_x.x.x.x.json` can be generated with [hccl_tools.py](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+
+2. For hyper parameters, please customize the file `hyper_parameter_config.ini`. Please note that the following two hyper parameters are not allowed to be configured in that file:
+    device_id
+    device_num
+
+3. For other models, please note that you should customize the option `run_script_dir` and the corresponding `hyper_parameter_config.ini`.
--- a/model_zoo/utils/ascend_distributed_launcher/init.py
+++ b/model_zoo/utils/ascend_distributed_launcher/init.py
--- a/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini
+++ b/model_zoo/utils/ascend_distributed_launcher/hyper_parameter_config.ini
@ -0,0 +1,11 @@
+[config]
+distribute=true
+epoch_size=40
+enable_save_ckpt=true
+enable_lossscale=true
+do_shuffle=true
+enable_data_sink=true
+data_sink_steps=100
+save_checkpoint_path=./checkpoint/
+save_checkpoint_steps=10000
+save_checkpoint_num=1
--- a/model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py
+++ b/model_zoo/utils/ascend_distributed_launcher/run_distribute_pretrain.py
@ -0,0 +1,142 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+"""distribute pretrain script"""
+import os
+import json
+import configparser
+import multiprocessing
+from argparse import ArgumentParser
+
+
+def parse_args():
+    """
+    Build the launcher's command-line parser and parse the process arguments.
+
+    Every option is a string that defaults to "" when it is omitted:
+    --run_script_dir, --hyper_parameter_config_dir, --data_dir, --hccl_config_dir.
+
+    Returns:
+        argparse.Namespace, the parsed options.
+
+    Examples:
+        >>> parse_args()
+    """
+    cli = ArgumentParser(description="mindspore distributed training")
+
+    # (flag, leading part of the help text) for every launcher option.
+    option_table = (
+        ("--run_script_dir", "Run script path"),
+        ("--hyper_parameter_config_dir", "Hyper Parameter config path"),
+        ("--data_dir", "Data path"),
+        ("--hccl_config_dir", "Hccl config path"),
+    )
+    for flag, summary in option_table:
+        cli.add_argument(flag, type=str, default="",
+                         help=summary + ", it is better to use absolute path")
+
+    return cli.parse_args()
+
+
+def distribute_pretrain():
+    """
+    Launch one background training process per device of this server.
+
+    The devices are taken from the hccl config file given by --hccl_config_dir, so you
+    do not need to specify the device count. The server entry used is the one whose
+    first device IP is listed in /etc/hccn.conf. For every device of that server the
+    function recreates ./LOG<device_id>, exports the per-rank environment variables and
+    starts the run script under `taskset` in the background, redirecting its stdout and
+    stderr to ./LOG<device_id>/log.txt.
+
+    Raises:
+        ValueError: if the hyper parameter config sets 'device_id', 'device_num' or
+            'data_dir', or if no server in the hccl config matches /etc/hccn.conf.
+    """
+    print("start", __file__)
+    args = parse_args()
+
+    run_script = args.run_script_dir
+    data_dir = args.data_dir
+    cf = configparser.ConfigParser()
+    cf.read(args.hyper_parameter_config_dir)
+    cfg = dict(cf.items("config"))
+
+    # These options are generated by the launcher itself. Compare option names exactly
+    # (not substrings of the joined command line, which would also reject harmless
+    # values) and fail before any log directory is touched or process is started.
+    reserved = {'device_id', 'device_num', 'data_dir'}
+    if reserved.intersection(cfg):
+        raise ValueError("hyper_parameter_config.ini can not setting 'device_id',"
+                         " 'device_num' or 'data_dir'! ")
+    opt = " ".join("--" + key + "=" + str(value) for key, value in cfg.items())
+
+    print("hccl_config_dir:", args.hccl_config_dir)
+    os.environ['MINDSPORE_HCCL_CONFIG_PATH'] = args.hccl_config_dir
+    os.environ['RANK_TABLE_FILE'] = args.hccl_config_dir
+
+    cores = multiprocessing.cpu_count()
+    print("the number of logical core:", cores)
+
+    # get device_ips
+    device_ips = {}
+    with open('/etc/hccn.conf', 'r') as fin:
+        for hccn_item in fin:
+            if hccn_item.strip().startswith('address_'):
+                device_id, device_ip = hccn_item.split('=')
+                device_id = device_id.split('_')[1]
+                device_ips[device_id] = device_ip.strip()
+
+    with open(args.hccl_config_dir, "r", encoding="utf-8") as fin:
+        hccl_config = json.load(fin)
+
+    # rank_size counts the devices of every server; this_server is the entry whose
+    # first device IP is configured on this host.
+    rank_size = 0
+    this_server = None
+    for server in hccl_config["server_list"]:
+        rank_size += len(server["device"])
+        if server["device"][0]["device_ip"] in device_ips.values():
+            this_server = server
+    if this_server is None:
+        raise ValueError("no server in " + args.hccl_config_dir +
+                         " has a device_ip listed in /etc/hccn.conf")
+
+    os.environ['RANK_SIZE'] = str(rank_size)
+    print("total rank size:", rank_size)
+    print("this server rank size:", len(this_server["device"]))
+    avg_core_per_rank = cores // len(this_server["device"])
+    core_gap = avg_core_per_rank - 1
+    print("avg_core_per_rank:", avg_core_per_rank)
+
+    cur_dir = os.getcwd()
+    for count, instance in enumerate(this_server["device"]):
+        # os.environ only accepts str values; convert in case the JSON stores numbers.
+        device_id = str(instance["device_id"])
+        rank_id = str(instance["rank_id"])
+        print("\nstart training for rank " + rank_id + ", device " + device_id + ":")
+        print("rank_id:", rank_id)
+        print("device_id:", device_id)
+
+        # Each rank is bound (via taskset) to its own contiguous range of logical cores.
+        start = count * avg_core_per_rank
+        end = start + core_gap
+        cmdopt = str(start) + "-" + str(end)
+
+        os.environ["DEVICE_ID"] = device_id
+        os.environ["RANK_ID"] = rank_id
+        os.environ["DEPLOY_MODE"] = "0"
+        os.environ["GE_USE_STATIC_MEMORY"] = "1"
+        os.environ["GLOG_log_dir"] = cur_dir + "/LOG" + device_id + "/ms_log"
+        os.environ["GLOG_logtostderr"] = "0"
+
+        os.system("rm -rf LOG" + device_id)
+        os.system("mkdir ./LOG" + device_id)
+        os.system("cp *.py ./LOG" + device_id)
+        os.system("mkdir -p ./LOG" + device_id + "/ms_log")
+        # Dump the environment only after the GLOG_* variables are set, so env.log
+        # records everything the training process inherits.
+        os.system("env > ./LOG" + device_id + "/env.log")
+
+        print("core_nums:", cmdopt)
+        # epoch_size is only echoed for information; do not fail if a config omits it.
+        print("epoch_size:", str(cfg.get('epoch_size', '')))
+        print("data_dir:", data_dir)
+        print("log_file_dir: ./LOG" + device_id + "/log.txt")
+
+        cmd = 'taskset -c ' + cmdopt + ' python ' + run_script + " "
+        cmd += opt
+        cmd += " --data_dir=" + data_dir
+        cmd += ' --device_id=' + device_id + ' --device_num=' \
+               + str(rank_size) + ' >./LOG' + device_id + '/log.txt 2>&1 &'
+
+        # The trailing '&' makes the shell start the training process in the background.
+        os.system(cmd)
+
+
+if __name__ == "__main__":
+    distribute_pretrain()
--- a/model_zoo/utils/hccl_tools/hccl_tools.py
+++ b/model_zoo/utils/hccl_tools/hccl_tools.py
@ -17,7 +17,6 @@ import os
 import sys
 import json
 import socket
-import platform
 from argparse import ArgumentParser
 from typing import Dict, Any

@ -114,40 +113,25 @@ def main():
                device_id = device_id.split('_')[1]
                device_ips[device_id] = device_ip.strip()

-    arch = platform.processor()
-    hccn_table = {'board_id': {'aarch64': '0x002f', 'x86_64': '0x0000'}[arch],
-                  'chip_info': '910',
-                  'deploy_mode': 'lab',
-                  'group_count': '1',
-                  'group_list': []}
-    instance_list = []
+    hccn_table = {'version': '1.0',
+                  'server_count': '1',
+                  'server_list': []}
+    device_list = []
    rank_id = 0
    for instance_id in device_num_list:
-        instance = {'devices': []}
        device_id = visible_devices[instance_id]
        device_ip = device_ips[device_id]
-        instance['devices'].append({
-            'device_id': device_id,
+        device = {'device_id': device_id,
                  'device_ip': device_ip,
-        })
+                  'rank_id': str(rank_id)}
        print('rank_id:{}, device_id:{}, device_ip:{}'.format(rank_id, device_id, device_ip))
-        instance['rank_id'] = str(rank_id)
        rank_id += 1
-        instance['server_id'] = server_id
-        instance_list.append(instance)
-    hccn_table['group_list'].append({
-        'device_num': str(len(device_num_list)),
-        'server_num': '1',
-        'group_name': '',
-        'instance_count': str(len(device_num_list)),
-        'instance_list': instance_list,
+        device_list.append(device)
+    hccn_table['server_list'].append({
+        'server_id': server_id,
+        'device': device_list,
+        'host_nic_ip': 'reserve'
    })
-    hccn_table['para_plane_nic_location'] = 'device'
-    hccn_table['para_plane_nic_name'] = []
-    for instance_id in device_num_list:
-        eth_id = visible_devices[instance_id]
-        hccn_table['para_plane_nic_name'].append('eth{}'.format(eth_id))
-    hccn_table['para_plane_nic_num'] = str(len(device_num_list))
    hccn_table['status'] = 'completed'

    # save hccn_table to file