forked from mindspore-Ecosystem/mindspore
!3670 remove old MINDSPORE_HCCL_CONFIG_PATH in model zoo
Merge pull request !3670 from panbingao/master
This commit is contained in:
commit
6eddd65cf1
|
@ -24,7 +24,7 @@ This is an example of training DeepLabV3 with PASCAL VOC 2012 dataset in MindSpo
|
|||
```
|
||||
- Run `run_distribute_train.sh` for distributed training.
|
||||
``` bash
|
||||
sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
|
||||
sh scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
|
||||
```
|
||||
### Evaluation
|
||||
Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path.
|
||||
|
|
|
@ -16,14 +16,13 @@
|
|||
|
||||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH"
|
||||
echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)"
|
||||
echo "bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH"
|
||||
echo "for example: bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH [PRETRAINED_CKPT_PATH](option)"
|
||||
echo "It is better to use absolute path."
|
||||
echo "=============================================================================================================="
|
||||
|
||||
DATA_DIR=$2
|
||||
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$1
|
||||
|
||||
export RANK_TABLE_FILE=$1
|
||||
export RANK_SIZE=8
|
||||
export DEVICE_NUM=8
|
||||
|
|
|
@ -87,13 +87,13 @@ FasterRcnn is a two-stage target detection network,This network uses a region pr
|
|||
|
||||
```
|
||||
# distributed training
|
||||
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
|
||||
sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
|
||||
|
||||
# standalone training
|
||||
sh run_standalone_train.sh [PRETRAINED_MODEL]
|
||||
```
|
||||
|
||||
> Rank_table.json which is specified by MINDSPORE_HCCL_CONFIG_PATH is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
|
||||
> Rank_table.json which is specified by RANK_TABLE_FILE is needed when you are running a distributed task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
|
||||
> As for PRETRAINED_MODEL, if not set, the model will be trained from the very beginning. Ready-made pretrained_models are not available now. Stay tuned.
|
||||
|
||||
#### Result
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
if [ $# -lt 1 ] || [ $# -gt 2 ]
|
||||
then
|
||||
echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
|
||||
echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -33,7 +33,7 @@ echo $PATH1
|
|||
|
||||
if [ ! -f $PATH1 ]
|
||||
then
|
||||
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -51,7 +51,6 @@ fi
|
|||
ulimit -u unlimited
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
|
||||
export RANK_TABLE_FILE=$PATH1
|
||||
|
||||
for((i=0; i<${DEVICE_NUM}; i++))
|
||||
|
|
|
@ -16,22 +16,22 @@
|
|||
|
||||
if [ $# != 1 ]
|
||||
then
|
||||
echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]"
|
||||
echo "Usage: sh run_train.sh [RANK_TABLE_FILE]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f $1 ]
|
||||
then
|
||||
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
ulimit -u unlimited
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1)
|
||||
export MINDSPORE_HCCL_CONFIG_PATH
|
||||
echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}"
|
||||
RANK_TABLE_FILE=$(realpath $1)
|
||||
export RANK_TABLE_FILE
|
||||
echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"
|
||||
|
||||
export SERVER_ID=0
|
||||
rank_start=$((DEVICE_NUM * SERVER_ID))
|
||||
|
|
|
@ -88,7 +88,7 @@ MaskRcnn is a two-stage target detection network,This network uses a region prop
|
|||
|
||||
```
|
||||
# distributed training
|
||||
sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
|
||||
sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
|
||||
|
||||
# standalone training
|
||||
sh run_standalone_train.sh [PRETRAINED_MODEL]
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
if [ $# != 2 ]
|
||||
then
|
||||
echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
|
||||
echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -35,7 +35,7 @@ echo $PATH2
|
|||
|
||||
if [ ! -f $PATH1 ]
|
||||
then
|
||||
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -48,7 +48,6 @@ fi
|
|||
ulimit -u unlimited
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
|
||||
export RANK_TABLE_FILE=$PATH1
|
||||
|
||||
echo 3 > /proc/sys/vm/drop_caches
|
||||
|
|
|
@ -60,7 +60,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
|
|||
|
||||
### Usage
|
||||
|
||||
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
|
||||
- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]
|
||||
- GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
|
||||
|
||||
### Launch
|
||||
|
|
|
@ -30,7 +30,6 @@ run_ascend()
|
|||
|
||||
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
|
||||
export PYTHONPATH=${BASEPATH}:$PYTHONPATH
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$4
|
||||
export RANK_TABLE_FILE=$4
|
||||
if [ -d "../train" ];
|
||||
then
|
||||
|
@ -81,7 +80,7 @@ run_gpu()
|
|||
if [ $# -gt 6 ] || [ $# -lt 4 ]
|
||||
then
|
||||
echo "Usage:\n \
|
||||
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
|
||||
Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]\n \
|
||||
GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
|
||||
"
|
||||
exit 1
|
||||
|
|
|
@ -141,7 +141,6 @@ def main():
|
|||
env['RANK_ID'] = str(rank_id)
|
||||
env['DEVICE_ID'] = str(device_id)
|
||||
if args.nproc_per_node > 1:
|
||||
env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
|
||||
env['RANK_TABLE_FILE'] = table_fn
|
||||
if os.path.exists(device_dir):
|
||||
shutil.rmtree(device_dir)
|
||||
|
|
|
@ -138,7 +138,7 @@ def main():
|
|||
env['RANK_ID'] = str(rank_id)
|
||||
env['DEVICE_ID'] = str(device_id)
|
||||
if args.nproc_per_node > 1:
|
||||
env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
|
||||
env['RANK_TABLE_FILE'] = table_fn
|
||||
env['RANK_TABLE_FILE'] = table_fn
|
||||
if os.path.exists(device_dir):
|
||||
shutil.rmtree(device_dir)
|
||||
|
|
|
@ -132,7 +132,7 @@ Parameters for both training and evaluation can be set in config.py.
|
|||
|
||||
```
|
||||
# distributed training
|
||||
Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
|
||||
Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH]
|
||||
[PRETRAINED_CKPT_PATH](optional)
|
||||
|
||||
# standalone training
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
if [ $# != 4 ] && [ $# != 5 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
|
||||
echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -57,7 +57,7 @@ fi
|
|||
|
||||
if [ ! -f $PATH1 ]
|
||||
then
|
||||
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -76,7 +76,6 @@ fi
|
|||
ulimit -u unlimited
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
|
||||
export RANK_TABLE_FILE=$PATH1
|
||||
|
||||
export SERVER_ID=0
|
||||
|
|
|
@ -140,7 +140,7 @@ def main():
|
|||
env['RANK_ID'] = str(rank_id)
|
||||
env['DEVICE_ID'] = str(device_id)
|
||||
if args.nproc_per_node > 1:
|
||||
env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
|
||||
env['RANK_TABLE_FILE'] = table_fn
|
||||
env['RANK_TABLE_FILE'] = table_fn
|
||||
if os.path.exists(device_dir):
|
||||
shutil.rmtree(device_dir)
|
||||
|
|
|
@ -74,7 +74,7 @@ Parameters for both training and inference can be set in config.py.
|
|||
|
||||
```
|
||||
# distributed training
|
||||
Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]
|
||||
Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]
|
||||
```
|
||||
|
||||
|
||||
|
|
|
@ -16,13 +16,13 @@
|
|||
|
||||
if [ $# != 3 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]"
|
||||
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f $1 ]
|
||||
then
|
||||
echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -38,7 +38,7 @@ cd $BASE_PATH/../ || exit
|
|||
ulimit -u unlimited
|
||||
export DEVICE_NUM=$3
|
||||
export RANK_SIZE=$3
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$1
|
||||
export RANK_TABLE_FILE=$1
|
||||
|
||||
for((i=0; i<${DEVICE_NUM}; i++))
|
||||
do
|
||||
|
|
|
@ -82,7 +82,7 @@ Parameters for both training and evaluating can be set in config.py
|
|||
|
||||
```
|
||||
# distribute training example(8p)
|
||||
sh run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
|
||||
sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
|
||||
# standalone training
|
||||
sh run_standalone_train.sh DEVICE_ID DATA_PATH
|
||||
```
|
||||
|
@ -91,7 +91,7 @@ sh run_standalone_train.sh DEVICE_ID DATA_PATH
|
|||
|
||||
```bash
|
||||
# distributed training example(8p) for Ascend
|
||||
sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH /dataset/train
|
||||
sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
|
||||
# standalone training example for Ascend
|
||||
sh scripts/run_standalone_train.sh 0 /dataset/train
|
||||
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
|
||||
echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
|
||||
echo "It is better to use absolute path."
|
||||
echo "================================================================================================================="
|
||||
|
@ -24,7 +24,7 @@ echo "==========================================================================
|
|||
if [ $# != 5 ] && [ $# != 7 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
|
||||
[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
|
||||
[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -41,7 +41,7 @@ LR=$3
|
|||
DATASET=$4
|
||||
PRE_TRAINED=$6
|
||||
PRE_TRAINED_EPOCH_SIZE=$7
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$5
|
||||
export RANK_TABLE_FILE=$5
|
||||
|
||||
for((i=0;i<RANK_SIZE;i++))
|
||||
do
|
||||
|
|
|
@ -209,10 +209,10 @@ parameters/options:
|
|||
- Train on Ascend.
|
||||
|
||||
```
|
||||
Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]
|
||||
Usage: sh script/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]
|
||||
|
||||
parameters/options:
|
||||
MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path.
|
||||
RANK_TABLE_FILE HCCL configuration file path.
|
||||
DATA_PATH the storage path of dataset.
|
||||
```
|
||||
|
||||
|
|
|
@ -16,13 +16,13 @@
|
|||
|
||||
if [ $# != 2 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
|
||||
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f $1 ]
|
||||
then
|
||||
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -34,7 +34,7 @@ fi
|
|||
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$1
|
||||
export RANK_TABLE_FILE=$1
|
||||
|
||||
for((i=0;i<RANK_SIZE;i++))
|
||||
do
|
||||
|
|
|
@ -77,7 +77,7 @@ Parameters for both training and evaluation can be set in config.py.
|
|||
|
||||
```
|
||||
# distributed training in Ascend
|
||||
Usage: bash run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
|
||||
Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]
|
||||
|
||||
# distributed training in GPU
|
||||
Usage: bash run_distribute_train_for_gpu.sh [RANK_SIZE] [DATASET_PATH]
|
||||
|
|
|
@ -15,7 +15,7 @@
|
|||
# ============================================================================
|
||||
|
||||
if [ $# != 2 ]; then
|
||||
echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
|
||||
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -31,7 +31,7 @@ PATH1=$(get_real_path $1)
|
|||
PATH2=$(get_real_path $2)
|
||||
|
||||
if [ ! -f $PATH1 ]; then
|
||||
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -43,7 +43,6 @@ fi
|
|||
ulimit -u unlimited
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
|
||||
export RANK_TABLE_FILE=$PATH1
|
||||
|
||||
for ((i = 0; i < ${DEVICE_NUM}; i++)); do
|
||||
|
|
|
@ -55,7 +55,7 @@ This is an example of training YOLOV3-DarkNet53 with COCO2014 dataset in MindSpo
|
|||
|
||||
```
|
||||
# distributed training
|
||||
sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]
|
||||
sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]
|
||||
|
||||
# standalone training
|
||||
sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE]
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
if [ $# != 3 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]"
|
||||
echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -30,10 +30,10 @@ get_real_path(){
|
|||
|
||||
DATASET_PATH=$(get_real_path $1)
|
||||
PRETRAINED_BACKBONE=$(get_real_path $2)
|
||||
MINDSPORE_HCCL_CONFIG_PATH=$(get_real_path $3)
|
||||
RANK_TABLE_FILE=$(get_real_path $3)
|
||||
echo $DATASET_PATH
|
||||
echo $PRETRAINED_BACKBONE
|
||||
echo $MINDSPORE_HCCL_CONFIG_PATH
|
||||
echo $RANK_TABLE_FILE
|
||||
|
||||
if [ ! -d $DATASET_PATH ]
|
||||
then
|
||||
|
@ -47,15 +47,15 @@ then
|
|||
exit 1
|
||||
fi
|
||||
|
||||
if [ ! -f $MINDSPORE_HCCL_CONFIG_PATH ]
|
||||
if [ ! -f $RANK_TABLE_FILE ]
|
||||
then
|
||||
echo "error: MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH is not a file"
|
||||
echo "error: RANK_TABLE_FILE=$RANK_TABLE_FILE is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
export DEVICE_NUM=8
|
||||
export RANK_SIZE=8
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH
|
||||
export RANK_TABLE_FILE=$RANK_TABLE_FILE
|
||||
|
||||
for((i=0; i<${DEVICE_NUM}; i++))
|
||||
do
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
echo "======================================================================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
|
||||
echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
|
||||
echo "It is better to use absolute path."
|
||||
echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
|
||||
|
@ -24,7 +24,7 @@ echo "==========================================================================
|
|||
|
||||
if [ $# != 6 ] && [ $# != 8 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
|
||||
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [RANK_TABLE_FILE] \
|
||||
[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
@ -42,7 +42,7 @@ python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image
|
|||
|
||||
echo "After running the script, the network runs in the background. The log will be generated in LOGx/log.txt"
|
||||
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$6
|
||||
export RANK_TABLE_FILE=$6
|
||||
export RANK_SIZE=$1
|
||||
|
||||
BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
|
||||
|
|
|
@ -21,7 +21,7 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no
|
|||
- Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.
|
||||
|
||||
``` bash
|
||||
sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
|
||||
sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE
|
||||
```
|
||||
|
||||
## Usage
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
|
||||
echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE"
|
||||
echo "for example: bash run_distribute_pretrain.sh 8 1 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
|
||||
echo "It is better to use absolute path."
|
||||
echo "=============================================================================================================="
|
||||
|
@ -26,7 +26,6 @@ DATA_DIR=$3
|
|||
SCHEMA_DIR=$4
|
||||
|
||||
ulimit -u unlimited
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$5
|
||||
export RANK_TABLE_FILE=$5
|
||||
export RANK_SIZE=$1
|
||||
export HCCL_CONNECT_TIMEOUT=300
|
||||
|
|
|
@ -44,7 +44,6 @@ set_hccl_json()
|
|||
do
|
||||
if [[ "$1" == "-j" || "$1" == "--hccl_json" ]]
|
||||
then
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$2
|
||||
export RANK_TABLE_FILE=$2
|
||||
break
|
||||
fi
|
||||
|
|
|
@ -74,7 +74,7 @@ This example implements training and evaluation of Transformer Model, which is i
|
|||
- Run `run_distribute_train.sh` for distributed training of Transformer model.
|
||||
|
||||
``` bash
|
||||
sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH
|
||||
sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE
|
||||
```
|
||||
|
||||
### Evaluation
|
||||
|
|
|
@ -16,7 +16,7 @@
|
|||
|
||||
echo "=============================================================================================================="
|
||||
echo "Please run the script as: "
|
||||
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH"
|
||||
echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
|
||||
echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
|
||||
echo "It is better to use absolute path."
|
||||
echo "=============================================================================================================="
|
||||
|
@ -28,7 +28,6 @@ cd run_distribute_train || exit
|
|||
EPOCH_SIZE=$2
|
||||
DATA_PATH=$3
|
||||
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$4
|
||||
export RANK_TABLE_FILE=$4
|
||||
export RANK_SIZE=$1
|
||||
export HCCL_FLAG=1
|
||||
|
|
|
@ -21,7 +21,7 @@ echo "After running the script, the network runs in the background, The log will
|
|||
|
||||
export RANK_SIZE=$1
|
||||
DATA_URL=$2
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$3
|
||||
export RANK_TABLE_FILE=$3
|
||||
|
||||
for ((i=0; i<RANK_SIZE;i++))
|
||||
do
|
||||
|
|
|
@ -22,7 +22,6 @@ export RANK_SIZE=$1
|
|||
export EPOCH_SIZE=$2
|
||||
export DATASET=$3
|
||||
export RANK_TABLE_FILE=$4
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$4
|
||||
|
||||
for((i=0;i<$RANK_SIZE;i++));
|
||||
do
|
||||
|
|
|
@ -22,7 +22,6 @@ export RANK_SIZE=$1
|
|||
export EPOCH_SIZE=$2
|
||||
export DATASET=$3
|
||||
export RANK_TABLE_FILE=$4
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$4
|
||||
|
||||
for((i=0;i<$RANK_SIZE;i++));
|
||||
do
|
||||
|
|
|
@ -21,7 +21,6 @@ export RANK_SIZE=$1
|
|||
export EPOCH_SIZE=$2
|
||||
export DATASET=$3
|
||||
export RANK_TABLE_FILE=$4
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$4
|
||||
|
||||
export MS_COMM_TYPE=zmq
|
||||
export MS_SCHED_NUM=1
|
||||
|
|
|
@ -29,7 +29,7 @@ EPOCH_SIZE=$2
|
|||
VOCAB_SIZE=$3
|
||||
EMB_DIM=$4
|
||||
DATASET=$5
|
||||
MINDSPORE_HCCL_CONFIG_PATH=$6
|
||||
RANK_TABLE_FILE=$6
|
||||
ENV_SH=$7
|
||||
MODE=$8
|
||||
|
||||
|
@ -39,7 +39,7 @@ do
|
|||
passwd=$(get_node_passwd ${cluster_config_path} ${node})
|
||||
echo "------------------${user}@${node}---------------------"
|
||||
if [ $MODE == "host_device_mix" ]; then
|
||||
ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${MINDSPORE_HCCL_CONFIG_PATH}"
|
||||
ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${RANK_TABLE_FILE}"
|
||||
else
|
||||
echo "[ERROR] mode is wrong"
|
||||
exit 1
|
||||
|
|
Loading…
Reference in New Issue