diff --git a/model_zoo/bert_thor/README.md b/model_zoo/bert_thor/README.md
index 189b9d9e4d..a3df8b73bb 100644
--- a/model_zoo/bert_thor/README.md
+++ b/model_zoo/bert_thor/README.md
@@ -21,7 +21,7 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
 
     ``` bash
-    sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+    sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE
     ```
 
 ## Usage
diff --git a/model_zoo/bert_thor/scripts/run_distribute_pretrain.sh b/model_zoo/bert_thor/scripts/run_distribute_pretrain.sh
index 217d757eb9..f82151bea0 100644
--- a/model_zoo/bert_thor/scripts/run_distribute_pretrain.sh
+++ b/model_zoo/bert_thor/scripts/run_distribute_pretrain.sh
@@ -16,7 +16,7 @@
 
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE"
 echo "for example: bash run_distribute_pretrain.sh 8 1 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
@@ -26,7 +26,6 @@
 DATA_DIR=$3
 SCHEMA_DIR=$4
 ulimit -u unlimited
-export MINDSPORE_HCCL_CONFIG_PATH=$5
 export RANK_TABLE_FILE=$5
 export RANK_SIZE=$1
 export HCCL_CONNECT_TIMEOUT=300
diff --git a/model_zoo/mobilenetv2/Readme.md b/model_zoo/mobilenetv2/Readme.md
index 1687d2cbdc..b39013b07e 100644
--- a/model_zoo/mobilenetv2/Readme.md
+++ b/model_zoo/mobilenetv2/Readme.md
@@ -60,7 +60,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
 
 ### Usage
 
-- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]
 - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
 
 ### Launch
diff --git a/model_zoo/mobilenetv2/scripts/run_train.sh b/model_zoo/mobilenetv2/scripts/run_train.sh
index a6e2a79477..f20f155e21 100644
--- a/model_zoo/mobilenetv2/scripts/run_train.sh
+++ b/model_zoo/mobilenetv2/scripts/run_train.sh
@@ -30,7 +30,6 @@ run_ascend()
 
     BASEPATH=$(cd "`dirname $0`" || exit; pwd)
     export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-    export MINDSPORE_HCCL_CONFIG_PATH=$4
    export RANK_TABLE_FILE=$4
 
     if [ -d "../train" ]; then
@@ -81,7 +80,7 @@ run_gpu()
 if [ $# -gt 6 ] || [ $# -lt 4 ]
 then
     echo "Usage:\n \
-          Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
+          Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]\n \
           GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
     "
 exit 1
diff --git a/model_zoo/mobilenetv2_quant/src/launch.py b/model_zoo/mobilenetv2_quant/src/launch.py
index 08477a363a..0d05ee9ad7 100644
--- a/model_zoo/mobilenetv2_quant/src/launch.py
+++ b/model_zoo/mobilenetv2_quant/src/launch.py
@@ -141,7 +141,6 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
diff --git a/model_zoo/mobilenetv3/src/launch.py b/model_zoo/mobilenetv3/src/launch.py
index 48c8159664..aba74379f7 100644
--- a/model_zoo/mobilenetv3/src/launch.py
+++ b/model_zoo/mobilenetv3/src/launch.py
@@ -138,7 +138,6 @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
diff --git a/model_zoo/official/cv/deeplabv3/README.md b/model_zoo/official/cv/deeplabv3/README.md
index 179793a3f7..fc98bbcfb5 100644
--- a/model_zoo/official/cv/deeplabv3/README.md
+++ b/model_zoo/official/cv/deeplabv3/README.md
@@ -25,7 +25,7 @@ This is an example of training DeepLabV3 with PASCAL VOC 2012 dataset in MindSpo
     ```
 - Run `run_distribute_train.sh` for distributed training.
     ``` bash
-    sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+    sh scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
     ```
 ### Evaluation
 Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path.
diff --git a/model_zoo/official/cv/deeplabv3/scripts/run_distribute_train.sh b/model_zoo/official/cv/deeplabv3/scripts/run_distribute_train.sh
index 4dcd8d9768..e59f803fed 100644
--- a/model_zoo/official/cv/deeplabv3/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/deeplabv3/scripts/run_distribute_train.sh
@@ -16,14 +16,13 @@
 
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH"
-echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)"
+echo "bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH"
+echo "for example: bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH [PRETRAINED_CKPT_PATH](optional)"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 
 DATA_DIR=$2
-
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+
 export RANK_TABLE_FILE=$1
 export RANK_SIZE=8
 PATH_CHECKPOINT=""
diff --git a/model_zoo/official/cv/faster_rcnn/README.md b/model_zoo/official/cv/faster_rcnn/README.md
index 182e7df24b..9b29d4aabf 100644
--- a/model_zoo/official/cv/faster_rcnn/README.md
+++ b/model_zoo/official/cv/faster_rcnn/README.md
@@ -87,13 +87,13 @@ FasterRcnn is a two-stage target detection network,This network uses a region pr
 
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]
 ```
 
-> Rank_table.json which is specified by MINDSPORE_HCCL_CONFIG_PATH is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+> Rank_table.json, which is specified by RANK_TABLE_FILE, is needed when you are running a distributed task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
 > As for PRETRAINED_MODEL,if not set, the model will be trained from the very beginning.Ready-made pretrained_models are not available now. Stay tuned.
 
 #### Result
diff --git a/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train.sh b/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train.sh
index 4f01831d48..b790f32a53 100755
--- a/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/faster_rcnn/scripts/run_distribute_train.sh
@@ -16,7 +16,7 @@
 
 if [ $# -lt 1 ] || [ $# -gt 2 ]
 then
-    echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+    echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
     exit 1
 fi
@@ -33,7 +33,7 @@
 echo $PATH1
 
 if [ ! -f $PATH1 ]
 then
-    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
     exit 1
 fi
@@ -51,7 +51,6 @@
 
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 
 for((i=0; i<${DEVICE_NUM}; i++))
diff --git a/model_zoo/official/cv/googlenet/scripts/run_train.sh b/model_zoo/official/cv/googlenet/scripts/run_train.sh
index e8c045c8b1..ed8a0e5f2a 100644
--- a/model_zoo/official/cv/googlenet/scripts/run_train.sh
+++ b/model_zoo/official/cv/googlenet/scripts/run_train.sh
@@ -16,22 +16,22 @@
 
 if [ $# != 1 ]
 then
-    echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]"
+    echo "Usage: sh run_train.sh [RANK_TABLE_FILE]"
     exit 1
 fi
 
 if [ ! -f $1 ]
 then
-    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+    echo "error: RANK_TABLE_FILE=$1 is not a file"
     exit 1
 fi
 
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1)
-export MINDSPORE_HCCL_CONFIG_PATH
-echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}"
+RANK_TABLE_FILE=$(realpath $1)
+export RANK_TABLE_FILE
+echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
 
 export SERVER_ID=0
 rank_start=$((DEVICE_NUM * SERVER_ID))
diff --git a/model_zoo/official/cv/maskrcnn/README.md b/model_zoo/official/cv/maskrcnn/README.md
index 65c66596a9..7e3704901e 100644
--- a/model_zoo/official/cv/maskrcnn/README.md
+++ b/model_zoo/official/cv/maskrcnn/README.md
@@ -88,7 +88,7 @@ MaskRcnn is a two-stage target detection network,This network uses a region prop
 
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 
 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]
diff --git a/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh b/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh
index 6c04cd0b91..ab4a172f6e 100644
--- a/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh
@@ -16,7 +16,7 @@
 
 if [ $# != 2 ]
 then
-    echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+    echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
     exit 1
 fi
@@ -35,7 +35,7 @@
 echo $PATH2
 
 if [ ! -f $PATH1 ]
 then
-    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
     exit 1
 fi
@@ -48,7 +48,6 @@
 
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 
 echo 3 > /proc/sys/vm/drop_caches
diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md
index ad93453602..943bafd4ac 100644
--- a/model_zoo/official/cv/resnet/README.md
+++ b/model_zoo/official/cv/resnet/README.md
@@ -132,7 +132,7 @@ Parameters for both training and evaluation can be set in config.py.
 
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH]
 [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training
diff --git a/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh b/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh
index efcb620cd8..0345910a7c 100755
--- a/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/resnet/scripts/run_distribute_train.sh
@@ -16,7 +16,7 @@
 
 if [ $# != 4 ] && [ $# != 5 ]
 then
-    echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+    echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
     exit 1
 fi
@@ -57,7 +57,7 @@
 
 if [ ! -f $PATH1 ]
 then
-    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+    echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
     exit 1
 fi
@@ -76,7 +76,6 @@
 
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1
 
 for((i=0; i<${DEVICE_NUM}; i++))
diff --git a/model_zoo/official/cv/resnext50/README.md b/model_zoo/official/cv/resnext50/README.md
index aab2952c96..6119fd7913 100644
--- a/model_zoo/official/cv/resnext50/README.md
+++ b/model_zoo/official/cv/resnext50/README.md
@@ -82,7 +82,7 @@ Parameters for both training and evaluating can be set in config.py
 
 ```
 # distribute training example(8p)
-sh run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 # standalone training
 sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```
@@ -91,7 +91,7 @@
 
 ```bash
 # distributed training example(8p) for Ascend
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH /dataset/train
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
 
 # standalone training example for Ascend
 sh scripts/run_standalone_train.sh 0 /dataset/train
diff --git a/model_zoo/official/cv/ssd/scripts/run_distribute_train.sh b/model_zoo/official/cv/ssd/scripts/run_distribute_train.sh
index 60eccf2c40..7175d22988 100644
--- a/model_zoo/official/cv/ssd/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/ssd/scripts/run_distribute_train.sh
@@ -16,7 +16,7 @@
 
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "================================================================================================================="
@@ -24,7 +24,7 @@
 
 if [ $# != 5 ] && [ $# != 7 ]
 then
     echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
-[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
     exit 1
 fi
@@ -41,7 +41,7 @@
 
 LR=$3
 DATASET=$4
 PRE_TRAINED=$6
 PRE_TRAINED_EPOCH_SIZE=$7
-export MINDSPORE_HCCL_CONFIG_PATH=$5
+export RANK_TABLE_FILE=$5
 
 for((i=0;i<RANK_SIZE;i++))
diff --git a/model_zoo/[...]/src/launch.py b/model_zoo/[...]/src/launch.py
@@ [...] @@ def main():
         env['RANK_ID'] = str(rank_id)
         env['DEVICE_ID'] = str(device_id)
         if args.nproc_per_node > 1:
-            env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
             env['RANK_TABLE_FILE'] = table_fn
         if os.path.exists(device_dir):
             shutil.rmtree(device_dir)
diff --git a/model_zoo/resnet_thor/README.md b/model_zoo/resnet_thor/README.md
index 5fb17007ae..cecd934575 100644
--- a/model_zoo/resnet_thor/README.md
+++ b/model_zoo/resnet_thor/README.md
@@ -74,7 +74,7 @@ Parameters for both training and inference can be set in config.py.
 
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]
+Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]
 ```
 
diff --git a/model_zoo/resnet_thor/scripts/run_distribute_train.sh b/model_zoo/resnet_thor/scripts/run_distribute_train.sh
index 6fa7457227..1fa72768ae 100644
--- a/model_zoo/resnet_thor/scripts/run_distribute_train.sh
+++ b/model_zoo/resnet_thor/scripts/run_distribute_train.sh
@@ -16,13 +16,13 @@
 
 if [ $# != 3 ]
 then
-    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]"
+    echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]"
     exit 1
 fi
 
 if [ ! -f $1 ]
 then
-    echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+    echo "error: RANK_TABLE_FILE=$1 is not a file"
     exit 1
 fi
@@ -38,7 +38,7 @@
 
 cd $BASE_PATH/../ || exit
 ulimit -u unlimited
 export DEVICE_NUM=$3
 export RANK_SIZE=$3
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1
 
 for((i=0; i<${DEVICE_NUM}; i++))
 do
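
After this patch, the model_zoo launch scripts export only RANK_TABLE_FILE before a multi-device run; MINDSPORE_HCCL_CONFIG_PATH is no longer set by any of them. A minimal sketch of the resulting launch flow, following the shell loops above (the hccl.json path, the 8-device layout, and the train.py entry point are illustrative assumptions, not taken from the patch):

``` bash
#!/bin/bash
# Sketch only: mirrors the pattern used by the run_distribute_train.sh
# scripts in this patch, with placeholder paths.
export RANK_TABLE_FILE=$(realpath /path/hccl.json)  # assumed path, e.g. generated by hccl_tools
export RANK_SIZE=8
export DEVICE_NUM=8

for((i=0; i<${DEVICE_NUM}; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    # train.py stands in for each model's training entry point
    python train.py > train_device$i.log 2>&1 &
done
wait
```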