From fedaadcf9ed290e2adeb9345089a9af10994c8e3 Mon Sep 17 00:00:00 2001
From: huangxinjing
Date: Mon, 16 Aug 2021 16:09:53 +0800
Subject: [PATCH] Add per batch size for the gpu

---
 model_zoo/official/nlp/pangu_alpha/README.md                  | 3 ++-
 .../nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh       | 8 +++++---
 2 files changed, 7 insertions(+), 4 deletions(-)

diff --git a/model_zoo/official/nlp/pangu_alpha/README.md b/model_zoo/official/nlp/pangu_alpha/README.md
index 3aa82373b85..b5018d2700e 100644
--- a/model_zoo/official/nlp/pangu_alpha/README.md
+++ b/model_zoo/official/nlp/pangu_alpha/README.md
@@ -179,12 +179,13 @@ https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools.
 The script will launch the GPU training through `mpirun`, the user can run the following command on any machine to start training.
 
 ```bash
-bash scripts/run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET MOD
+bash scripts/run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET PER_BATCH MODE
 ```
 
 - RANK_SIZE: The device number. This can be your total device numbers. For example, 8, 16, 32 ...
 - HOSTFILE: It's a text file describes the host ip and its devices. Please see our [tutorial](https://www.mindspore.cn/docs/programming_guide/en/master/distributed_training_gpu.html) or [OpenMPI](https://www.open-mpi.org/) for more details.
 - DATASET: The path to the mindrecord files's parent directory . For example: `/home/work/mindrecord/`.
+- PER_BATCH: The batch size for each data-parallel way.
 - MODE: Can be `2.6B`, `13B` and `200B`.
 
 ### Incremental Training
diff --git a/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh b/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
index 3fd78dd5d90..fe1dee3abfd 100644
--- a/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
+++ b/model_zoo/official/nlp/pangu_alpha/scripts/run_distribute_train_gpu.sh
@@ -16,8 +16,8 @@
 
 echo "=============================================================================================================="
 echo "Please run the script as: "
-echo "bash run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET MODE"
-echo "for example: bash run_distributed_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 2.6B"
+echo "bash run_distributed_train_gpu.sh RANK_SIZE HOSTFILE DATASET PER_BATCH MODE"
+echo "for example: bash run_distributed_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 16 2.6B"
 echo "It is better to use absolute path."
 echo "=============================================================================================================="
 
@@ -26,7 +26,8 @@ self_path=$(dirname "${script_self}")
 RANK_SIZE=$1
 HOSTFILE=$2
 DATASET=$3
-MODE=$4
+PER_BATCH=$4
+MODE=$5
 
 mpirun --allow-run-as-root -x PATH -x LD_LIBRARY_PATH -x PYTHONPATH -x NCCL_DEBUG -x GLOG_v -n $RANK_SIZE --hostfile $HOSTFILE --output-filename log_output --merge-stderr-to-stdout \
 python -s ${self_path}/../train.py \
@@ -35,4 +36,5 @@ mpirun --allow-run-as-root -x PATH -x LD_LIBRARY_PATH -x PYTHONPATH -x NCCL_DEBU
       --device_target="GPU" \
       --data_url=$DATASET \
       --mode=$MODE \
+      --per_batch_size=$PER_BATCH \
       --run_type=train > train_log.txt 2>&1 &
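
For reference, below is a minimal launch sketch with the updated argument order. The device count, hostfile name, dataset path, and per-batch value simply mirror the example in the script's own usage message, and the command assumes it is run from the pangu_alpha model directory; adapt all of these to the actual cluster.

```bash
# Illustrative values only: 16 devices listed in hostfile_16p, mindrecord data
# under /mass_dataset/train_data/, a per-data-parallel-way batch size of 16
# (the new 4th argument), and the 2.6B model configuration.
bash scripts/run_distribute_train_gpu.sh 16 hostfile_16p /mass_dataset/train_data/ 16 2.6B
```

The script forwards the 4th argument unchanged to train.py as --per_batch_size, alongside the existing --data_url and --mode options.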