remove old MINDSPORE_HCCL_CONFIG_PATH in model zoo

commit 3e82ae7f51 (parent 1b69923472), forked from mindspore-Ecosystem/mindspore
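Every hunk below applies the same mechanical change: the environment variable that points HCCL at the rank table json is renamed, and the stale export of the old name is dropped. A minimal sketch of the pattern (the hccl.json path is a placeholder):

```bash
# before this commit: the legacy variable name used across the model zoo
export MINDSPORE_HCCL_CONFIG_PATH=/path/hccl.json

# after this commit: the same file, exported under the name the scripts now read
export RANK_TABLE_FILE=/path/hccl.json
export RANK_SIZE=8   # unchanged; still set alongside the rank table in the scripts below
```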
@@ -21,7 +21,7 @@ This is an example of training bert by second-order optimizer THOR. THOR is a no
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.

 ``` bash
-sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE
 ```

 ## Usage

@@ -16,7 +16,7 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR RANK_TABLE_FILE"
 echo "for example: bash run_distribute_pretrain.sh 8 1 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="

@@ -26,7 +26,6 @@ DATA_DIR=$3
 SCHEMA_DIR=$4

 ulimit -u unlimited
-export MINDSPORE_HCCL_CONFIG_PATH=$5
 export RANK_TABLE_FILE=$5
 export RANK_SIZE=$1
 export HCCL_CONNECT_TIMEOUT=300
@@ -60,7 +60,7 @@ Dataset used: [imagenet](http://www.image-net.org/)

 ### Usage

-- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]
+- Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]
 - GPU: sh run_trian.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]

 ### Launch

@@ -30,7 +30,6 @@ run_ascend()

 BASEPATH=$(cd "`dirname $0`" || exit; pwd)
 export PYTHONPATH=${BASEPATH}:$PYTHONPATH
-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export RANK_TABLE_FILE=$4
 if [ -d "../train" ];
 then

@@ -81,7 +80,7 @@ run_gpu()
 if [ $# -gt 6 ] || [ $# -lt 4 ]
 then
 echo "Usage:\n \
-Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [CKPT_PATH]\n \
+Ascend: sh run_train.sh Ascend [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [RANK_TABLE_FILE] [DATASET_PATH] [CKPT_PATH]\n \
 GPU: sh run_train.sh GPU [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]\n \
 "
 exit 1
@@ -141,7 +141,6 @@ def main():
 env['RANK_ID'] = str(rank_id)
 env['DEVICE_ID'] = str(device_id)
 if args.nproc_per_node > 1:
-    env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
     env['RANK_TABLE_FILE'] = table_fn
 if os.path.exists(device_dir):
     shutil.rmtree(device_dir)

@@ -138,7 +138,7 @@ def main():
 env['RANK_ID'] = str(rank_id)
 env['DEVICE_ID'] = str(device_id)
 if args.nproc_per_node > 1:
-    env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+    env['RANK_TABLE_FILE'] = table_fn
     env['RANK_TABLE_FILE'] = table_fn
 if os.path.exists(device_dir):
     shutil.rmtree(device_dir)
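The Python launchers changed above feed each worker its rank-table path through the environment; the shell scripts later in this diff do the same thing with a loop. A minimal sketch of that per-device pattern, assembled from the scripts in this commit (the train.py entry point and log naming are illustrative):

```bash
export RANK_TABLE_FILE=/path/hccl.json  # was MINDSPORE_HCCL_CONFIG_PATH before this commit
export RANK_SIZE=8

for ((i = 0; i < RANK_SIZE; i++)); do
    export DEVICE_ID=$i   # which Ascend device this worker uses
    export RANK_ID=$i     # this worker's rank inside the rank table
    python train.py > log$i.txt 2>&1 &   # one background process per device
done
wait
```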
@@ -25,7 +25,7 @@ This is an example of training DeepLabV3 with PASCAL VOC 2012 dataset in MindSpo
 ```
 - Run `run_distribute_train.sh` for distributed training.
 ``` bash
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 ```
 ### Evaluation
 Set options in evaluation_config.py. Make sure the 'data_file' and 'finetune_ckpt' are set to your own path.

@@ -16,14 +16,13 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH"
-echo "for example: bash run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH [PRETRAINED_CKPT_PATH](option)"
+echo "bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH"
+echo "for example: bash run_distribute_train.sh RANK_TABLE_FILE DATA_PATH [PRETRAINED_CKPT_PATH](option)"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="

 DATA_DIR=$2

-export MINDSPORE_HCCL_CONFIG_PATH=$1
 export RANK_TABLE_FILE=$1
 export RANK_SIZE=8
 PATH_CHECKPOINT=""
@@ -87,13 +87,13 @@ FasterRcnn is a two-stage target detection network,This network uses a region pr

 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]

 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]
 ```

-> Rank_table.json which is specified by MINDSPORE_HCCL_CONFIG_PATH is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
+> Rank_table.json which is specified by RANK_TABLE_FILE is needed when you are running a distribute task. You can generate it by using the [hccl_tools](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools).
 > As for PRETRAINED_MODEL,if not set, the model will be trained from the very beginning.Ready-made pretrained_models are not available now. Stay tuned.

 #### Result
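For reference, generating Rank_table.json with the linked hccl_tools usually amounts to one command; a hedged sketch (the --device_num flag and the generated file name are assumptions, check the tool's own README):

```bash
# generate a rank table covering devices 0-7 on this host (flags assumed)
python hccl_tools.py --device_num "[0,8)"
# point the renamed variable at the generated json (file name illustrative)
export RANK_TABLE_FILE=$(realpath hccl_8p.json)
```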
@@ -16,7 +16,7 @@

 if [ $# -lt 1 ] || [ $# -gt 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi

@@ -33,7 +33,7 @@ echo $PATH1

 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi

@@ -51,7 +51,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1

 for((i=0; i<${DEVICE_NUM}; i++))

@@ -16,22 +16,22 @@

 if [ $# != 1 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE]"
 exit 1
 fi

 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi

 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-MINDSPORE_HCCL_CONFIG_PATH=$(realpath $1)
-export MINDSPORE_HCCL_CONFIG_PATH
-echo "MINDSPORE_HCCL_CONFIG_PATH=${MINDSPORE_HCCL_CONFIG_PATH}"
+RANK_TABLE_FILE=$(realpath $1)
+export RANK_TABLE_FILE
+echo "RANK_TABLE_FILE=${RANK_TABLE_FILE}"

 export SERVER_ID=0
 rank_start=$((DEVICE_NUM * SERVER_ID))
@@ -88,7 +88,7 @@ MaskRcnn is a two-stage target detection network,This network uses a region prop

 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_MODEL]
+sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]

 # standalone training
 sh run_standalone_train.sh [PRETRAINED_MODEL]

@@ -16,7 +16,7 @@

 if [ $# != 2 ]
 then
-echo "Usage: sh run_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [PRETRAINED_PATH]"
+echo "Usage: sh run_train.sh [RANK_TABLE_FILE] [PRETRAINED_PATH]"
 exit 1
 fi

@@ -35,7 +35,7 @@ echo $PATH2

 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi

@@ -48,7 +48,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1

 echo 3 > /proc/sys/vm/drop_caches

@@ -132,7 +132,7 @@ Parameters for both training and evaluation can be set in config.py.

 ```
 # distributed training
-Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH]
 [PRETRAINED_CKPT_PATH](optional)

 # standalone training
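A concrete call matching the usage line above might look like this (all paths are placeholders):

```bash
# illustrative: 8-device ResNet-50 on ImageNet2012 with the renamed rank-table argument
sh run_distribute_train.sh resnet50 imagenet2012 /path/hccl.json /path/imagenet2012
```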
@@ -16,7 +16,7 @@

 if [ $# != 4 ] && [ $# != 5 ]
 then
-echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
+echo "Usage: sh run_distribute_train.sh [resnet50|resnet101] [cifar10|imagenet2012] [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi

@@ -57,7 +57,7 @@ fi

 if [ ! -f $PATH1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi

@@ -76,7 +76,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1

 for((i=0; i<${DEVICE_NUM}; i++))

@@ -82,7 +82,7 @@ Parameters for both training and evaluating can be set in config.py

 ```
 # distribute training example(8p)
-sh run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH DATA_PATH
+sh run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
 # standalone training
 sh run_standalone_train.sh DEVICE_ID DATA_PATH
 ```

@@ -91,7 +91,7 @@ sh run_standalone_train.sh DEVICE_ID DATA_PATH

 ```bash
 # distributed training example(8p) for Ascend
-sh scripts/run_distribute_train.sh MINDSPORE_HCCL_CONFIG_PATH /dataset/train
+sh scripts/run_distribute_train.sh RANK_TABLE_FILE /dataset/train
 # standalone training example for Ascend
 sh scripts/run_standalone_train.sh 0 /dataset/train
@@ -16,7 +16,7 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE LR DATASET RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "for example: sh run_distribute_train.sh 8 500 0.2 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
 echo "It is better to use absolute path."
 echo "================================================================================================================="

@@ -24,7 +24,7 @@ echo "=============================================================================================================="
 if [ $# != 5 ] && [ $# != 7 ]
 then
 echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [LR] [DATASET] \
-[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
+[RANK_TABLE_FILE] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi

@@ -41,7 +41,7 @@ LR=$3
 DATASET=$4
 PRE_TRAINED=$6
 PRE_TRAINED_EPOCH_SIZE=$7
-export MINDSPORE_HCCL_CONFIG_PATH=$5
+export RANK_TABLE_FILE=$5

 for((i=0;i<RANK_SIZE;i++))
 do
@@ -101,9 +101,9 @@ parameters/options:
 ### Distribute Training

 ```
-Usage: sh script/run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]
+Usage: sh script/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]

 parameters/options:
-MINDSPORE_HCCL_CONFIG_PATH HCCL configuration file path.
+RANK_TABLE_FILE HCCL configuration file path.
 DATA_PATH the storage path of dataset.
 ```
@@ -16,13 +16,13 @@

 if [ $# != 2 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATA_PATH]"
 exit 1
 fi

 if [ ! -f $1 ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -34,7 +34,7 @@ fi

 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1

 for((i=0;i<RANK_SIZE;i++))
 do
@@ -77,7 +77,7 @@ Parameters for both training and evaluation can be set in config.py.

 ```
 # distributed training in Ascend
-Usage: bash run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: bash run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]

 # distributed training in GPU
 Usage: bash run_distribute_train_for_gpu.sh [RANK_SIZE] [DATASET_PATH]

@@ -15,7 +15,7 @@
 # ============================================================================

 if [ $# != 2 ]; then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH]"
 exit 1
 fi

@@ -31,7 +31,7 @@ PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)

 if [ ! -f $PATH1 ]; then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$PATH1 is not a file"
+echo "error: RANK_TABLE_FILE=$PATH1 is not a file"
 exit 1
 fi

@@ -43,7 +43,6 @@ fi
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$PATH1
 export RANK_TABLE_FILE=$PATH1

 for ((i = 0; i < ${DEVICE_NUM}; i++)); do
@@ -55,7 +55,7 @@ This is an example of training YOLOV3-DarkNet53 with COCO2014 dataset in MindSpo

 ```
 # distributed training
-sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]
+sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]

 # standalone training
 sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE]

@@ -16,7 +16,7 @@

 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [MINDSPORE_HCCL_CONFIG_PATH]"
+echo "Usage: sh run_distribute_train.sh [DATASET_PATH] [PRETRAINED_BACKBONE] [RANK_TABLE_FILE]"
 exit 1
 fi

@@ -30,10 +30,10 @@ get_real_path(){

 DATASET_PATH=$(get_real_path $1)
 PRETRAINED_BACKBONE=$(get_real_path $2)
-MINDSPORE_HCCL_CONFIG_PATH=$(get_real_path $3)
+RANK_TABLE_FILE=$(get_real_path $3)
 echo $DATASET_PATH
 echo $PRETRAINED_BACKBONE
-echo $MINDSPORE_HCCL_CONFIG_PATH
+echo $RANK_TABLE_FILE

 if [ ! -d $DATASET_PATH ]
 then
@@ -47,15 +47,15 @@ then
 exit 1
 fi

-if [ ! -f $MINDSPORE_HCCL_CONFIG_PATH ]
+if [ ! -f $RANK_TABLE_FILE ]
 then
-echo "error: MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH is not a file"
+echo "error: RANK_TABLE_FILE=$RANK_TABLE_FILE is not a file"
 exit 1
 fi

 export DEVICE_NUM=8
 export RANK_SIZE=8
-export MINDSPORE_HCCL_CONFIG_PATH=$MINDSPORE_HCCL_CONFIG_PATH
+export RANK_TABLE_FILE=$RANK_TABLE_FILE

 for((i=0; i<${DEVICE_NUM}; i++))
 do
@@ -16,7 +16,7 @@

 echo "======================================================================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
+echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH RANK_TABLE_FILE PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
 echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
 echo "It is better to use absolute path."
 echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."

@@ -24,7 +24,7 @@ echo "==============================================================================================================

 if [ $# != 6 ] && [ $# != 8 ]
 then
-echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
+echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [RANK_TABLE_FILE] \
 [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
 exit 1
 fi

@@ -42,7 +42,7 @@ python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image

 echo "After running the scipt, the network runs in the background. The log will be generated in LOGx/log.txt"

-export MINDSPORE_HCCL_CONFIG_PATH=$6
+export RANK_TABLE_FILE=$6
 export RANK_SIZE=$1

 BASE_PATH=$(cd "`dirname $0`" || exit; pwd)
@@ -21,7 +21,7 @@ This example implements pre-training, fine-tuning and evaluation of [BERT-base](
 - Run `run_distribute_pretrain.sh` for distributed pre-training of BERT-base and BERT-NEZHA model.

 ``` bash
-sh scripts/run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH
 ```

 ### Fine-Tuning and Evaluation

@@ -16,9 +16,11 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "bash run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_DIR SCHEMA_DIR MINDSPORE_HCCL_CONFIG_PATH"
-echo "for example: bash run_distribute_pretrain.sh 8 40 /path/zh-wiki/ /path/Schema.json /path/hccl.json"
+echo "bash run_distribute_pretrain.sh DATA_DIR MINDSPORE_HCCL_CONFIG_PATH"
+echo "for example: bash run_distribute_pretrain.sh /path/dataset /path/hccl.json"
 echo "It is better to use absolute path."
+echo "For hyper parameter, please note that you should customize the scripts:
+'{CUR_DIR}/scripts/ascend_distributed_launcher/hyper_parameter_config.ini' "
 echo "=============================================================================================================="

 EPOCH_SIZE=$2

@@ -44,7 +44,6 @@ set_hccl_json()
 do
 if [[ "$1" == "-j" || "$1" == "--hccl_json" ]]
 then
-export MINDSPORE_HCCL_CONFIG_PATH=$2
 export RANK_TABLE_FILE=$2
 break
 fi
@@ -74,7 +74,7 @@ This example implements training and evaluation of Transformer Model, which is i
 - Run `run_distribute_train.sh` for distributed training of Transformer model.

 ``` bash
-sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH
+sh scripts/run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE
 ```

 ### Evaluation

@@ -16,7 +16,7 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH MINDSPORE_HCCL_CONFIG_PATH"
+echo "sh run_distribute_pretrain.sh DEVICE_NUM EPOCH_SIZE DATA_PATH RANK_TABLE_FILE"
 echo "for example: sh run_distribute_pretrain.sh 8 52 /path/ende-l128-mindrecord00 /path/hccl.json"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="

@@ -28,7 +28,6 @@ cd run_distribute_train || exit
 EPOCH_SIZE=$2
 DATA_PATH=$3

-export MINDSPORE_HCCL_CONFIG_PATH=$4
 export RANK_TABLE_FILE=$4
 export RANK_SIZE=$1
 export HCCL_FLAG=1
@ -21,7 +21,7 @@ echo "After running the script, the network runs in the background, The log will
|
||||||
|
|
||||||
export RANK_SIZE=$1
|
export RANK_SIZE=$1
|
||||||
DATA_URL=$2
|
DATA_URL=$2
|
||||||
export MINDSPORE_HCCL_CONFIG_PATH=$3
|
export RANK_TABLE_FILE=$3
|
||||||
|
|
||||||
for ((i=0; i<RANK_SIZE;i++))
|
for ((i=0; i<RANK_SIZE;i++))
|
||||||
do
|
do
|
||||||
|
|
|
@@ -22,7 +22,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4

 for((i=0;i<$RANK_SIZE;i++));
 do

@@ -22,7 +22,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4

 for((i=0;i<$RANK_SIZE;i++));
 do

@@ -21,7 +21,6 @@ export RANK_SIZE=$1
 export EPOCH_SIZE=$2
 export DATASET=$3
 export RANK_TABLE_FILE=$4
-export MINDSPORE_HCCL_CONFIG_PATH=$4

 export MS_COMM_TYPE=zmq
 export MS_SCHED_NUM=1
@@ -29,7 +29,7 @@ EPOCH_SIZE=$2
 VOCAB_SIZE=$3
 EMB_DIM=$4
 DATASET=$5
-MINDSPORE_HCCL_CONFIG_PATH=$6
+RANK_TABLE_FILE=$6
 ENV_SH=$7
 MODE=$8

@@ -39,7 +39,7 @@ do
 passwd=$(get_node_passwd ${cluster_config_path} ${node})
 echo "------------------${user}@${node}---------------------"
 if [ $MODE == "host_device_mix" ]; then
-ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${MINDSPORE_HCCL_CONFIG_PATH}"
+ssh_pass ${node} ${user} ${passwd} "mkdir -p ${execute_path}; cd ${execute_path}; bash ${SCRIPTPATH}/run_auto_parallel_train_cluster.sh ${RANK_SIZE} ${RANK_START} ${EPOCH_SIZE} ${VOCAB_SIZE} ${EMB_DIM} ${DATASET} ${ENV_SH} ${MODE} ${RANK_TABLE_FILE}"
 else
 echo "[ERROR] mode is wrong"
 exit 1
@@ -140,7 +140,7 @@ def main():
 env['RANK_ID'] = str(rank_id)
 env['DEVICE_ID'] = str(device_id)
 if args.nproc_per_node > 1:
-    env['MINDSPORE_HCCL_CONFIG_PATH'] = table_fn
+    env['RANK_TABLE_FILE'] = table_fn
     env['RANK_TABLE_FILE'] = table_fn
 if os.path.exists(device_dir):
     shutil.rmtree(device_dir)
@@ -74,7 +74,7 @@ Parameters for both training and inference can be set in config.py.

 ```
 # distributed training
-Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]
+Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]
 ```

@@ -16,13 +16,13 @@

 if [ $# != 3 ]
 then
-echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [DEVICE_NUM]"
+echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [DEVICE_NUM]"
 exit 1
 fi

 if [ ! -f $1 ]
 then
-echo "error: DMINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
+echo "error: RANK_TABLE_FILE=$1 is not a file"
 exit 1
 fi
@@ -38,7 +38,7 @@ cd $BASE_PATH/../ || exit
 ulimit -u unlimited
 export DEVICE_NUM=$3
 export RANK_SIZE=$3
-export MINDSPORE_HCCL_CONFIG_PATH=$1
+export RANK_TABLE_FILE=$1

 for((i=0; i<${DEVICE_NUM}; i++))
 do
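A quick way to audit how many scripts still reference the old variable after applying the commit (a generic check, not part of the diff; note the BERT hunks above intentionally keep the old name for now):

```bash
# any remaining hit means a script still reads the removed variable
grep -rn "MINDSPORE_HCCL_CONFIG_PATH" model_zoo/ || echo "no stale references"
```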