From fedaadcf9ed290e2adeb9345089a9af10994c8e3 Mon Sep 17 00:00:00 2001
From: huangxinjing
Date: Mon, 16 Aug 2021 16:09:53 +0800
Subject: [PATCH] Add per batch size for the gpu

---
 model_zoo/official/nlp/pangu_alpha/README.md                  | 3 ++-
 .../nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh       | 8 +++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/model_zoo/official/nlp/pangu_alpha/README.md b/model_zoo/official/nlp/pangu_alpha/README.md
index 3aa82373b85..b5018d2700e 100644
--- a/model_zoo/official/nlp/pangu_alpha/README.md
+++ b/model_zoo/official/nlp/pangu_alpha/README.md
@@ -179,12 +179,13 @@ https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools.
 The script will launch the GPU training through `mpirun`, the user can run the following command on any machine to start training.
 
 ```bash
-bash scripts/run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET MOD
+bash scripts/run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET PER_BATCH MODE
 ```
 
 - RANK_SIZE: The device number. This can be your total device numbers. For example, 8, 16, 32 ...
 - HOSTFILE: It's a text file describes the host ip and its devices. Please see our [tutorial](https://www.mindspore.cn/docs/programming_guide/en/master/distributed_training_gpu.html) or [OpenMPI](https://www.open-mpi.org/) for more details.
 - DATASET: The path to the mindrecord files's parent directory . For example: `/home/work/mindrecord/`.
+- PER_BATCH: The batch size for each data-parallel way.
 - MODE: Can be `2.6B`, `13B` and `200B`.
 
 ### Incremental Training
diff --git a/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh b/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
index 3fd78dd5d90..fe1dee3abfd 100644
--- a/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
+++ b/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
@@ -16,8 +16,8 @@
 
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET MODE"
-echo "for example: bash run_distributed_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 2.6B"
+echo "bash run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET PER_BATCH MODE"
+echo "for example: bash run_distributed_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 16 2.6B"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 
@@ -26,7 +26,8 @@ self_path=$(dirname "${script_self}")
 RANK_SIZE=$1
 HOSTFILE=$2
 DATASET=$3
-MODE=$4
+PER_BATCH=$4
+MODE=$5
 
 mpirun --allow-run-as-root -x PATH -x LD_LIBRARY_PATH -x PYTHONPATH -x NCCL_DEBUG -x GLOG_v -n $RANK_SIZE --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout \
 python -s ${self_path}/../train.py \
@@ -35,4 +36,5 @@ mpirun --allow-run-as-root -x PATH -x LD_LIBRARY_PATH -x PYTHONPATH -x NCCL_DEBU
       --device_target="GPU" \
       --data_url=$DATASET \
       --mode=$MODE \
+      --per_batch_size=$PER_BATCH \
       --run_type=train > train_log.txt 2>&1 &
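
For reference, below is a minimal launch sketch with the updated argument order. The device count, hostfile name, dataset path, and per-batch value simply mirror the example in the script's own usage message, and the command assumes it is run from the pangu_alpha model directory; adapt all of these to the actual cluster.

```bash
# Illustrative values only: 16 devices listed in hostfile_16p, mindrecord data
# under /mass_dataset/train_data/, a per-data-parallel-way batch size of 16
# (the new 4th argument), and the 2.6B model configuration.
bash scripts/run_distribute_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 16 2.6B
```

The script forwards the 4th argument unchanged to train.py as --per_batch_size, alongside the existing --data_url and --mode options.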