remove old MINDSPORE_HCCL_CONFIG_PATH in model zoo

panbingao 2020-07-29 17:48:09 +08:00
parent 1b69923472
commit 3e82ae7f51
37 changed files with 62 additions and 73 deletions
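In every launch script touched by this commit the pattern is the same: the HCCL rank-table JSON used to be exported under both names, and only `RANK_TABLE_FILE` survives. A minimal before/after sketch of that pattern (the positional argument `$5` mirrors the BERT pretrain script; other scripts use a different position or `realpath`):

```bash
# Before this commit: both variables pointed at the same rank-table JSON.
export MINDSPORE_HCCL_CONFIG_PATH=$5   # legacy name, removed here
export RANK_TABLE_FILE=$5

# After this commit: only the current name is exported.
export RANK_TABLE_FILE=$5
```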


@@ -21,7 +21,7 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 ``` bash
-sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE
 ```
 ## Usage


@@ -16,7 +16,7 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE"
 echo "for example: bash run_distribute_pretrain.sh 8 1 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
@@ -26,7 +26,6 @@ DATA_DIR=$3
 SCHEMA_DIR=$4
 ulimit -u unlimited
-export MINDSPORE_HCCL_CONFIG_PATH=$5
 export RANK_TABLE_FILE=$5
 export RANK_SIZE=$1
 export HCCL_CONNECT_TIMEOUT=300
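The part of these scripts that actually consumes `RANK_SIZE` and `RANK_TABLE_FILE` is the per-device launch loop that follows each hunk. That loop is not part of this diff; the sketch below shows its typical shape, with the training command, log directories, and flags as placeholders rather than the scripts' exact contents:

```bash
# Assumed shape of the per-device launch loop (not taken from this diff).
for ((i = 0; i < RANK_SIZE; i++)); do
    export DEVICE_ID=$i          # Ascend device used by this process
    export RANK_ID=$i            # rank that must match an entry in RANK_TABLE_FILE
    rm -rf LOG$i && mkdir LOG$i && cd LOG$i || exit
    python ../train.py > log.txt 2>&1 &   # placeholder training command
    cd ..
done
```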


@@ -60,7 +60,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
 ### Usage
-- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]
 - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
 ### Launch


@@ -30,7 +30,6 @@ run_ascend()
 BASEPATH=$(cd "`dirname $0`" || exit; pwd)
 export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export RANK_TABLE_FILE=$4
 if [ -d "../train" ];
 then
@@ -81,7 +80,7 @@ run_gpu()
 if [ $# -gt 6 ] || [ $# -lt 4 ]
 then
 echo "Usage:\n \
-Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
+Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]\n \
 GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
 "
 exit 1


@@ -141,7 +141,6 @@ def main():
 env['RANK_ID'] = str(rank_id)
 env['DEVICE_ID'] = str(device_id)
 if args.nproc_per_node > 1:
-    env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
     env['RANK_TABLE_FILE'] = table_fn
 if os.path.exists(device_dir):
     shutil.rmtree(device_dir)


@@ -138,7 +138,7 @@ def main():
 env['RANK_ID'] = str(rank_id)
 env['DEVICE_ID'] = str(device_id)
 if args.nproc_per_node > 1:
-    env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+    env['RANK_TABLE_FILE'] = table_fn
     env['RANK_TABLE_FILE'] = table_fn
 if os.path.exists(device_dir):
     shutil.rmtree(device_dir)


@@ -25,7 +25,7 @@ This is an example of training DeepLabV3 with PASCAL VOC 2012 dataset in MindSpo
 ```
 - Run `run_distribute_train.sh` for distributed training.
 ``` bash
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 ```
 ### Evaluation
 Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path.


@@ -16,14 +16,13 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH"
-echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)"
+echo "bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH"
+echo "for example: bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH [PRETRAINED_CKPT_PATH](option)"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 DATA_DIR=$2
-export MINDSPORE_HCCL_CONFIG_PATH=$1
 export RANK_TABLE_FILE=$1
 export RANK_SIZE=8
 PATH_CHECKPOINT=""


@@ -87,13 +87,13 @@ FasterRcnn is a two-stage target detection network,This network uses a region pr
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]
 ```
-> Rank_table.json which is specified by MINDSPORE_HCCL_CONFIG_PATH is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+> Rank_table.json which is specified by RANK_TABLE_FILE is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
 > As for PRETRAINED_MODEL, if not set, the model will be trained from the very beginning. Ready-made pretrained_models are not available now. Stay tuned.
 #### Result
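To make the note above concrete: `RANK_TABLE_FILE` only has to point at the rank-table JSON generated by hccl_tools before the distributed script runs. A minimal sketch, with the JSON and checkpoint paths purely illustrative:

```bash
# Illustrative only: the JSON is whatever hccl_tools generated for your host.
RANK_TABLE_FILE=$(realpath /path/to/hccl_8p.json)
export RANK_TABLE_FILE
sh run_distribute_train.sh $RANK_TABLE_FILE /path/to/pretrained.ckpt
```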


@@ -16,7 +16,7 @@
 if [ $# -lt 1 ] || [ $# -gt 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi
@@ -33,7 +33,7 @@ echo $PATH1
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -51,7 +51,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for((i=0; i<${DEVICE_NUM}; i++))


@@ -16,22 +16,22 @@
 if [ $# != 1 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1)
-export MINDSPORE_HCCL_CONFIG_PATH
-echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}"
+RANK_TABLE_FILE=$(realpath $1)
+export RANK_TABLE_FILE
+echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
 export SERVER_ID=0
 rank_start=$((DEVICE_NUM * SERVER_ID))


@@ -88,7 +88,7 @@ MaskRcnn is a two-stage target detection network,This network uses a region prop
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]


@@ -16,7 +16,7 @@
 if [ $# != 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi
@@ -35,7 +35,7 @@ echo $PATH2
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -48,7 +48,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 echo 3 > /proc/sys/vm/drop_caches


@@ -132,7 +132,7 @@ Parameters for both training and evaluation can be set in config.py.
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH]
 [PRETRAINED_CKPT_PATH](optional)
 # standalone training


@@ -16,7 +16,7 @@
 if [ $# != 4 ] && [ $# != 5 ]
 then
-echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
@@ -57,7 +57,7 @@ fi
 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -76,7 +76,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for((i=0; i<${DEVICE_NUM}; i++))


@@ -82,7 +82,7 @@ Parameters for both training and evaluating can be set in config.py
 ```
 # distribute training example(8p)
-sh run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 # standalone training
 sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
@@ -91,7 +91,7 @@ sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```bash
 # distributed training example(8p) for Ascend
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH /dataset/train
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
 # standalone training example for Ascend
 sh scripts/run_standalone_train.sh 0 /dataset/train


@@ -16,7 +16,7 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "================================================================================================================="
@@ -24,7 +24,7 @@ echo "=========================================================================="
 if [ $# != 5 ] && [ $# != 7 ]
 then
 echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
-[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi
@@ -41,7 +41,7 @@ LR=$3
 DATASET=$4
 PRE_TRAINED=$6
 PRE_TRAINED_EPOCH_SIZE=$7
-export MINDSPORE_HCCL_CONFIG_PATH=$5
+export RANK_TABLE_FILE=$5
 for((i=0;i<RANK_SIZE;i++))
 do


@@ -101,9 +101,9 @@ parameters/options:
 ### Distribute Training
 ```
-Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]
+Usage: sh script/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
 parameters/options:
-MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path.
+RANK_TABLE_FILE HCCL configuration file path.
 DATA_PATH the storage path of dataset.
 ```
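A concrete call matching those options, with both paths illustrative rather than taken from the repository:

```bash
sh script/run_distribute_train.sh /path/to/hccl_8p.json /path/to/dataset
```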


@@ -16,13 +16,13 @@
 if [ $# != 2 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -34,7 +34,7 @@ fi
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1
 for((i=0;i<RANK_SIZE;i++))
 do


@@ -77,7 +77,7 @@ Parameters for both training and evaluation can be set in config.py.
 ```
 # distributed training in Ascend
-Usage: bash run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]
 # distributed training in GPU
 Usage: bash run_distribute_train_for_gpu.sh [RANK_SIZE] [DATASET_PATH]


@@ -15,7 +15,7 @@
 # ============================================================================
 if [ $# != 2 ]; then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]"
 exit 1
 fi
@@ -31,7 +31,7 @@ PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
 if [ ! -f $PATH1 ]; then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi
@@ -43,7 +43,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 for ((i = 0; i < ${DEVICE_NUM}; i++)); do


@@ -55,7 +55,7 @@ This is an example of training YOLOV3-DarkNet53 with COCO2014 dataset in MindSpo
 ```
 # distributed training
-sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]
+sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]
 # standalone training
 sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE]


@@ -16,7 +16,7 @@
 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]"
 exit 1
 fi
@@ -30,10 +30,10 @@ get_real_path(){
 DATASET_PATH=$(get_real_path $1)
 PRETRAINED_BACKBONE=$(get_real_path $2)
-MINDSPORE_HCCL_CONFIG_PATH=$(get_real_path $3)
+RANK_TABLE_FILE=$(get_real_path $3)
 echo $DATASET_PATH
 echo $PRETRAINED_BACKBONE
-echo $MINDSPORE_HCCL_CONFIG_PATH
+echo $RANK_TABLE_FILE
 if [ ! -d $DATASET_PATH ]
 then
@@ -47,15 +47,15 @@ then
 exit 1
 fi
-if [ ! -f $MINDSPORE_HCCL_CONFIG_PATH ]
+if [ ! -f $RANK_TABLE_FILE ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH is not a file"
+echo "error: RANK_TABLE_FILE=$RANK_TABLE_FILE is not a file"
 exit 1
 fi
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH
+export RANK_TABLE_FILE=$RANK_TABLE_FILE
 for((i=0; i<${DEVICE_NUM}; i++))
 do


@@ -16,7 +16,7 @@
 echo "======================================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
@@ -24,7 +24,7 @@ echo "=========================================================================="
 if [ $# != 6 ] && [ $# != 8 ]
 then
-echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
+echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [RANK_TABLE_FILE] \
 [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi
@@ -42,7 +42,7 @@ python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image
 echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt"
-export MINDSPORE_HCCL_CONFIG_PATH=$6
+export RANK_TABLE_FILE=$6
 export RANK_SIZE=$1
 BASE_PATH=$(cd "`dirname $0`" || exit; pwd)


@@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 ``` bash
-sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
 ```
 ### Fine-Tuning and Evaluation


@@ -16,9 +16,11 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
+echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
 echo "It is better to use absolute path."
+echo "For hyper parameter, please note that you should customize the scripts:
+'{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
 echo "=============================================================================================================="
 EPOCH_SIZE=$2


@@ -44,7 +44,6 @@ set_hccl_json()
 do
 if [[ "$1" == "-j" || "$1" == "--hccl_json" ]]
 then
-export MINDSPORE_HCCL_CONFIG_PATH=$2
 export RANK_TABLE_FILE=$2
 break
 fi


@@ -74,7 +74,7 @@ This example implements training and evaluation of Transformer Model, which is i
 - Run `run_distribute_train.sh` for distributed training of Transformer model.
 ``` bash
-sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE
 ```
 ### Evaluation


@@ -16,7 +16,7 @@
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH"
+echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
 echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
@@ -28,7 +28,6 @@ cd run_distribute_train || exit
 EPOCH_SIZE=$2
 DATA_PATH=$3
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export RANK_TABLE_FILE=$4
 export RANK_SIZE=$1
 export HCCL_FLAG=1


@@ -21,7 +21,7 @@ echo "After running the script, the network runs in the background, The log will
 export RANK_SIZE=$1
 DATA_URL=$2
-export MINDSPORE_HCCL_CONFIG_PATH=$3
+export RANK_TABLE_FILE=$3
 for ((i=0; i<RANK_SIZE;i++))
 do


@@ -22,7 +22,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 for((i=0;i<$RANK_SIZE;i++));
 do


@@ -22,7 +22,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 for((i=0;i<$RANK_SIZE;i++));
 do


@@ -21,7 +21,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export MS_COMM_TYPE=zmq
 export MS_SCHED_NUM=1


@@ -29,7 +29,7 @@ EPOCH_SIZE=$2
 VOCAB_SIZE=$3
 EMB_DIM=$4
 DATASET=$5
-MINDSPORE_HCCL_CONFIG_PATH=$6
+RANK_TABLE_FILE=$6
 ENV_SH=$7
 MODE=$8
@@ -39,7 +39,7 @@ do
 passwd=$(get_node_passwd ${cluster_config_path} ${node})
 echo "------------------${user}@${node}---------------------"
 if [ $MODE == "host_device_mix" ]; then
-ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${MINDSPORE_HCCL_CONFIG_PATH}"
+ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${RANK_TABLE_FILE}"
 else
 echo "[ERROR] mode is wrong"
 exit 1


@@ -140,7 +140,7 @@ def main():
 env['RANK_ID'] = str(rank_id)
 env['DEVICE_ID'] = str(device_id)
 if args.nproc_per_node > 1:
-    env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+    env['RANK_TABLE_FILE'] = table_fn
     env['RANK_TABLE_FILE'] = table_fn
 if os.path.exists(device_dir):
     shutil.rmtree(device_dir)


@@ -74,7 +74,7 @@ Parameters for both training and inference can be set in config.py.
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]
+Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]
 ```


@@ -16,13 +16,13 @@
 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]"
 exit 1
 fi
 if [ ! -f $1 ]
 then
-echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -38,7 +38,7 @@ cd $BASE_PATH/../ || exit
 ulimit -u unlimited
 export DEVICE_NUM=$3
 export RANK_SIZE=$3
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1
 for((i=0; i<${DEVICE_NUM}; i++))
 do