From 845ff342eda0f7760d0465bdd382a131db34cc7f Mon Sep 17 00:00:00 2001 From: linqingke Date: Fri, 4 Sep 2020 16:14:38 +0800 Subject: [PATCH] Save the GPU backend multi-card output in different folders. --- model_zoo/official/cv/faster_rcnn/README.md | 2 +- .../cv/faster_rcnn/src/network_define.py | 5 +++-- model_zoo/official/cv/faster_rcnn/train.py | 2 +- .../cv/googlenet/scripts/run_train_gpu.sh | 2 +- .../scripts/run_distribute_train_gpu.sh | 3 ++- model_zoo/official/cv/maskrcnn/README.md | 2 +- .../cv/maskrcnn/src/network_define.py | 5 +++-- model_zoo/official/cv/maskrcnn/train.py | 2 +- .../cv/mobilenetv2/scripts/run_train.sh | 2 +- .../cv/mobilenetv2_quant/scripts/run_train.sh | 6 +++--- .../cv/mobilenetv3/scripts/run_train.sh | 2 +- .../scripts/run_distribute_train_for_gpu.sh | 3 ++- .../scripts/run_distribute_train_gpu.sh | 4 ++-- .../scripts/run_parameter_server_train_gpu.sh | 12 +++++------ .../scripts/run_distribute_train_gpu.sh | 7 ++++--- .../scripts/run_distribute_train_for_gpu.sh | 2 +- model_zoo/official/cv/shufflenetv2/Readme.md | 16 +++++++-------- .../scripts/run_distribute_train_for_gpu.sh | 2 +- .../vgg16/scripts/run_distribute_train_gpu.sh | 3 ++- .../scripts/run_distribute_train_for_gpu.sh | 2 +- .../scripts/run_distribute_train_gpu.sh | 3 ++- .../run_distributed_pretrain_for_gpu.sh | 2 +- .../official/nlp/mass/scripts/run_gpu.sh | 3 ++- .../nlp/mass/src/utils/loss_monitor.py | 5 +++-- model_zoo/official/nlp/mass/train.py | 20 +++++++++++-------- .../scripts/run_distributed_gd_gpu.sh | 2 +- model_zoo/official/nlp/transformer/train.py | 7 ++++--- .../scripts/run_distribute_train_gpu.sh | 2 +- .../script/run_multigpu_train.sh | 2 +- .../script/run_multigpu_train_host_device.sh | 2 +- 30 files changed, 73 insertions(+), 59 deletions(-) diff --git a/model_zoo/official/cv/faster_rcnn/README.md b/model_zoo/official/cv/faster_rcnn/README.md index c73c855c352..085118239b9 100644 --- a/model_zoo/official/cv/faster_rcnn/README.md +++ b/model_zoo/official/cv/faster_rcnn/README.md @@ -154,7 +154,7 @@ sh run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] ### Result -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log. +Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint files together with results like the following in loss_rankid.log. ``` diff --git a/model_zoo/official/cv/faster_rcnn/src/network_define.py b/model_zoo/official/cv/faster_rcnn/src/network_define.py index bfa498bccc0..658d5c0876a 100644 --- a/model_zoo/official/cv/faster_rcnn/src/network_define.py +++ b/model_zoo/official/cv/faster_rcnn/src/network_define.py @@ -39,7 +39,7 @@ class LossCallBack(Callback): per_print_times (int): Print loss every times. Default: 1.
""" - def __init__(self, per_print_times=1): + def __init__(self, per_print_times=1, rank_id=0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") @@ -51,6 +51,7 @@ class LossCallBack(Callback): self.rpn_reg_loss_sum = 0 self.rcnn_cls_loss_sum = 0 self.rcnn_reg_loss_sum = 0 + self.rank_id = rank_id global time_stamp_init, time_stamp_first if not time_stamp_init: @@ -91,7 +92,7 @@ class LossCallBack(Callback): total_loss = rpn_loss + rcnn_loss - loss_file = open("./loss.log", "a+") + loss_file = open("./loss_{}.log".format(self.rank_id), "a+") loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, " "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, total_loss: %.5f" % (time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cur_step_in_epoch, diff --git a/model_zoo/official/cv/faster_rcnn/train.py b/model_zoo/official/cv/faster_rcnn/train.py index 53238a0dd75..7da7dce2750 100644 --- a/model_zoo/official/cv/faster_rcnn/train.py +++ b/model_zoo/official/cv/faster_rcnn/train.py @@ -131,7 +131,7 @@ if __name__ == '__main__': net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale) time_cb = TimeMonitor(data_size=dataset_size) - loss_cb = LossCallBack() + loss_cb = LossCallBack(rank_id=rank) cb = [time_cb, loss_cb] if config.save_checkpoint: ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size, diff --git a/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh b/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh index b30160238d0..6357b395185 100644 --- a/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh +++ b/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh @@ -44,7 +44,7 @@ export CUDA_VISIBLE_DEVICES="$2" if [ $1 -gt 1 ] then - mpirun -n $1 --allow-run-as-root \ + mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python3 ${BASEPATH}/../train.py > train.log 2>&1 & else python3 ${BASEPATH}/../train.py > train.log 2>&1 & diff --git a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh index 305f1dcfff5..0ddfde76b91 100644 --- a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh @@ -14,4 +14,5 @@ # limitations under the License. # ============================================================================ DATA_DIR=$1 -mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & +mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \ + python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & diff --git a/model_zoo/official/cv/maskrcnn/README.md b/model_zoo/official/cv/maskrcnn/README.md index df4e699d2b5..25ec1832daa 100644 --- a/model_zoo/official/cv/maskrcnn/README.md +++ b/model_zoo/official/cv/maskrcnn/README.md @@ -329,7 +329,7 @@ sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] ### [Training Result](#content) -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log. 
+Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint files together with results like the following in loss_rankid.log. ``` diff --git a/model_zoo/official/cv/maskrcnn/src/network_define.py b/model_zoo/official/cv/maskrcnn/src/network_define.py index 1229d13f4c7..841f0fc3eb1 100644 --- a/model_zoo/official/cv/maskrcnn/src/network_define.py +++ b/model_zoo/official/cv/maskrcnn/src/network_define.py @@ -40,7 +40,7 @@ class LossCallBack(Callback): per_print_times (int): Print loss every times. Default: 1. """ - def __init__(self, per_print_times=1): + def __init__(self, per_print_times=1, rank_id=0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") @@ -53,6 +53,7 @@ class LossCallBack(Callback): self.rcnn_cls_loss_sum = 0 self.rcnn_reg_loss_sum = 0 self.rcnn_mask_loss_sum = 0 + self.rank_id = rank_id global time_stamp_init, time_stamp_first if not time_stamp_init: @@ -96,7 +97,7 @@ class LossCallBack(Callback): total_loss = rpn_loss + rcnn_loss - loss_file = open("./loss.log", "a+") + loss_file = open("./loss_{}.log".format(self.rank_id), "a+") loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, " "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, rcnn_mask_loss: %.5f, " "total_loss: %.5f" % diff --git a/model_zoo/official/cv/maskrcnn/train.py b/model_zoo/official/cv/maskrcnn/train.py index 0081cec6d3c..cbe0a492a97 100644 --- a/model_zoo/official/cv/maskrcnn/train.py +++ b/model_zoo/official/cv/maskrcnn/train.py @@ -126,7 +126,7 @@ if __name__ == '__main__': net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale) time_cb = TimeMonitor(data_size=dataset_size) - loss_cb = LossCallBack() + loss_cb = LossCallBack(rank_id=rank) cb = [time_cb, loss_cb] if config.save_checkpoint: ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size, diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh index c7eea7ef965..0680c3d6ca2 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh @@ -72,7 +72,7 @@ run_gpu() cd ../train || exit export CUDA_VISIBLE_DEVICES="$3" - mpirun -n $2 --allow-run-as-root \ + mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python ${BASEPATH}/../train.py \ --platform=$1 \ --dataset_path=$4 \ diff --git a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh index 51abcda0bd9..a2732619309 100644 --- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh @@ -277,14 +277,14 @@ run_gpu(){ env > env.log if [ $# == 3 ] then - mpirun --allow-run-as-root -n ${RANK_SIZE} \ + mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \ python train.py --device_target=$1 --dataset_path=$PATH1 &> train.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n ${RANK_SIZE} \ - python train.py --device_target=$1 --dataset_path=$PATH1 --pre_trained=$PATH2 &> train.log & + mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \ + python train.py --device_target=$1 --dataset_path=$PATH1
--pre_trained=$PATH2 &> train.log & fi cd .. diff --git a/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh index 25c0667a590..e9f1ac745dd 100644 --- a/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh @@ -38,7 +38,7 @@ run_gpu() cd ../train || exit export CUDA_VISIBLE_DEVICES="$3" - mpirun -n $2 --allow-run-as-root \ + mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python ${BASEPATH}/../train.py \ --dataset_path=$4 \ --device_target=$1 \ diff --git a/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh index 305f1dcfff5..0ddfde76b91 100755 --- a/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh @@ -14,4 +14,5 @@ # limitations under the License. # ============================================================================ DATA_DIR=$1 -mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & +mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \ + python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & diff --git a/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh index d8e1139b637..ff3b023536d 100755 --- a/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh @@ -80,14 +80,14 @@ cd ./train_parallel || exit if [ $# == 3 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & fi diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh index ecbb345eed4..1941cc234ff 100755 --- a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh @@ -86,14 +86,14 @@ cp -r ../src ./sched cd ./sched || exit if [ $# == 3 ] then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log & fi @@ -111,14 +111,14 @@ do cd ./server_$i || exit if [ $# == 3 ] 
then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log & fi @@ -134,14 +134,14 @@ cp -r ../src ./worker cd ./worker || exit if [ $# == 3 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log & fi diff --git a/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh index a5799c71cd4..ffe821c2968 100755 --- a/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh @@ -41,6 +41,7 @@ cp *.sh ./train_parallel cp -r ../src ./train_parallel cd ./train_parallel || exit -mpirun --allow-run-as-root -n $RANK_SIZE \ -python train.py --run_distribute=True \ ---device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & \ No newline at end of file +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ + python train.py --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & + diff --git a/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh index 6ab980a0fad..aa8ca74462a 100644 --- a/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh @@ -22,7 +22,7 @@ then PATH_CHECKPOINT=$2 fi -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py \ --is_distribute=1 \ --platform="GPU" \ diff --git a/model_zoo/official/cv/shufflenetv2/Readme.md b/model_zoo/official/cv/shufflenetv2/Readme.md index ff83b656d4e..b975d951313 100644 --- a/model_zoo/official/cv/shufflenetv2/Readme.md +++ b/model_zoo/official/cv/shufflenetv2/Readme.md @@ -54,14 +54,14 @@ Dataset used: [imagenet](http://www.image-net.org/) +-- ShuffleNetV2 +-- Readme.md # descriptions about ShuffleNetV2 +-- scripts - ¦ +--run_distribute_train_for_gpu.sh # shell script for distributed training - ¦ +--run_eval_for_gpu.sh # shell script for evaluation - ¦ +--run_standalone_train_for_gpu.sh # shell script for standalone training + +--run_distribute_train_for_gpu.sh # shell script for distributed training 
+ +--run_eval_for_gpu.sh # shell script for evaluation + +--run_standalone_train_for_gpu.sh # shell script for standalone training +-- src - ¦ +--config.py # parameter configuration - ¦ +--dataset.py # creating dataset - ¦ +--loss.py # loss function for network - ¦ +--lr_generator.py # learning rate config + +--config.py # parameter configuration + +--dataset.py # creating dataset + +--loss.py # loss function for network + +--lr_generator.py # learning rate config +-- train.py # training script +-- eval.py # evaluation script +-- blocks.py # ShuffleNetV2 blocks @@ -83,7 +83,7 @@ You can start training using python or shell scripts. The usage of shell scripts ``` # training example python: - GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 & + GPU: mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 & shell: GPU: cd scripts & sh run_distribute_train_for_gpu.sh 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ diff --git a/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh index ec03ea7bfe2..edea873b64e 100644 --- a/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh @@ -50,7 +50,7 @@ export CUDA_VISIBLE_DEVICES="$2" if [ $# == 3 ] then - mpirun -n $1 --allow-run-as-root \ + mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python ${BASEPATH}/../train.py --platform='GPU' --is_distributed=True --dataset_path=$3 > train.log 2>&1 & fi diff --git a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh index 2784822d430..bdab8ff7b25 100644 --- a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh @@ -22,7 +22,8 @@ echo "========================================================================== DATA_PATH=$1 -mpirun -n 8 python train.py \ +mpirun -n 8 --output-filename log_output --merge-stderr-to-stdout \ + python train.py \ --device_target="GPU" \ --dataset="imagenet2012" \ --is_distributed=1 \ diff --git a/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh index 86d951f6633..816079871c0 100644 --- a/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh @@ -44,7 +44,7 @@ cp ../*.py ./distribute_train cp -r ../src ./distribute_train cd ./distribute_train || exit -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py \ --dataset_path=$DATASET_PATH \ --platform=GPU \ diff --git a/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh index b54070844cb..41d00b30513 100644 --- a/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh @@ -53,7 +53,8 @@ cp ../*.py ./train_parallel cp -r ../src ./train_parallel cd 
./train_parallel || exit env > env.log -mpirun --allow-run-as-root -n ${DEVICE_NUM} python train.py \ +mpirun --allow-run-as-root -n ${DEVICE_NUM} --output-filename log_output --merge-stderr-to-stdout \ +python train.py \ --data_dir=$DATASET_PATH \ --pretrained_backbone=$PRETRAINED_BACKBONE \ --device_target=GPU \ diff --git a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh index 7ca644652ea..9fbc156b665 100644 --- a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh +++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh @@ -26,7 +26,7 @@ EPOCH_SIZE=$2 DATA_DIR=$3 SCHEMA_DIR=$4 -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python run_pretrain.py \ --device_target="GPU" \ --distribute="true" \ diff --git a/model_zoo/official/nlp/mass/scripts/run_gpu.sh b/model_zoo/official/nlp/mass/scripts/run_gpu.sh index aae93bbac7c..053b2d9b651 100644 --- a/model_zoo/official/nlp/mass/scripts/run_gpu.sh +++ b/model_zoo/official/nlp/mass/scripts/run_gpu.sh @@ -146,7 +146,8 @@ if [ "$task" == "train" ] then if [ $RANK_SIZE -gt 1 ] then - mpirun -n $RANK_SIZE python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 & + mpirun -n $RANK_SIZE --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ + python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 & fi python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 & elif [ "$task" == "infer" ] diff --git a/model_zoo/official/nlp/mass/src/utils/loss_monitor.py b/model_zoo/official/nlp/mass/src/utils/loss_monitor.py index c62230d6ae3..1d3467363af 100644 --- a/model_zoo/official/nlp/mass/src/utils/loss_monitor.py +++ b/model_zoo/official/nlp/mass/src/utils/loss_monitor.py @@ -33,12 +33,13 @@ class LossCallBack(Callback): time_stamp_init = False time_stamp_first = 0 - def __init__(self, config: TransformerConfig, per_print_times: int = 1): + def __init__(self, config: TransformerConfig, per_print_times: int = 1, rank_id: int = 0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") self.config = config self._per_print_times = per_print_times + self.rank_id = rank_id if not self.time_stamp_init: self.time_stamp_first = self._get_ms_timestamp() @@ -46,7 +47,7 @@ class LossCallBack(Callback): def step_end(self, run_context): cb_params = run_context.original_args() - file_name = "./loss.log" + file_name = "./loss_{}.log".format(self.rank_id) with open(file_name, "a+") as f: time_stamp_current = self._get_ms_timestamp() f.write("time: {}, epoch: {}, step: {}, outputs are {},{},{}.\n".format( diff --git a/model_zoo/official/nlp/mass/train.py b/model_zoo/official/nlp/mass/train.py index 80ed331b54f..e78cd1b1da7 100644 --- a/model_zoo/official/nlp/mass/train.py +++ b/model_zoo/official/nlp/mass/train.py @@ -199,24 +199,28 @@ def _build_training_pipeline(config: TransformerConfig, scale_update_cell=scale_manager.get_update_cell()) net_with_grads.set_train(True) model = Model(net_with_grads) - loss_monitor = LossCallBack(config) ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps, keep_checkpoint_max=config.keep_ckpt_max) rank_size = os.getenv('RANK_SIZE') - callbacks = [loss_monitor] - if rank_size is not None and int(rank_size) > 1 
and MultiAscend.get_rank() % 8 == 0: - ckpt_callback = ModelCheckpoint( - prefix=config.ckpt_prefix, - directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), - config=ckpt_config) - callbacks.append(ckpt_callback) + callbacks = [] + if rank_size is not None and int(rank_size) > 1: + loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank()) + callbacks.append(loss_monitor) + if MultiAscend.get_rank() % 8 == 0: + ckpt_callback = ModelCheckpoint( + prefix=config.ckpt_prefix, + directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiAscend.get_rank())), + config=ckpt_config) + callbacks.append(ckpt_callback) if rank_size is None or int(rank_size) == 1: ckpt_callback = ModelCheckpoint( prefix=config.ckpt_prefix, directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), config=ckpt_config) + loss_monitor = LossCallBack(config, rank_id=os.getenv('DEVICE_ID')) + callbacks.append(loss_monitor) callbacks.append(ckpt_callback) print(f" | ALL SET, PREPARE TO TRAIN.") diff --git a/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh index d09f49760da..987d1fe6f85 100644 --- a/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh +++ b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh @@ -29,7 +29,7 @@ TEACHER_CKPT_PATH=$5 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python ${PROJECT_DIR}/../run_general_distill.py \ --distribute="true" \ --device_target="GPU" \ diff --git a/model_zoo/official/nlp/transformer/train.py b/model_zoo/official/nlp/transformer/train.py index a1d040fad4c..60a8535699c 100644 --- a/model_zoo/official/nlp/transformer/train.py +++ b/model_zoo/official/nlp/transformer/train.py @@ -53,11 +53,12 @@ class LossCallBack(Callback): Args: per_print_times (int): Print loss every times. Default: 1. 
""" - def __init__(self, per_print_times=1): + def __init__(self, per_print_times=1, rank_id=0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") self._per_print_times = per_print_times + self.rank_id = rank_id global time_stamp_init, time_stamp_first if not time_stamp_init: time_stamp_first = get_ms_timestamp() @@ -71,7 +72,7 @@ class LossCallBack(Callback): print("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cb_params.cur_step_num, str(cb_params.net_outputs))) - with open("./loss.log", "a+") as f: + with open("./loss_{}.log".fromat(self.rank_id), "a+") as f: f.write("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cb_params.cur_step_num, @@ -145,7 +146,7 @@ def run_transformer_train(): min_lr=cfg.lr_schedule.min_lr), mstype.float32) optimizer = Adam(netwithloss.trainable_params(), lr) - callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()] + callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)] if args.enable_save_ckpt == "true": if device_num == 1 or (device_num > 1 and rank_id == 0): ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps, diff --git a/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh b/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh index 832cc409d48..7cf8f513c6c 100644 --- a/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh @@ -28,7 +28,7 @@ cp *.py ./log cp -r src ./log cd ./log || exit env > env.log -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python -u train.py \ --dataset_path=$DATA_URL \ --ckpt_path="checkpoint" \ diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh index ae8d31e36ec..9403ad685ee 100644 --- a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh +++ b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh @@ -21,7 +21,7 @@ RANK_SIZE=$1 EPOCH_SIZE=$2 DATASET=$3 -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python -s ${self_path}/../train_and_eval_distribute.py \ --device_target="GPU" \ --data_path=$DATASET \ diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh index 772e01f767d..475b3184588 100644 --- a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh +++ b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh @@ -23,7 +23,7 @@ DATASET=$3 VOCAB_SIZE=$4 EMB_DIM=$5 -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python -s ${self_path}/../train_and_eval_auto_parallel.py \ --device_target="GPU" \ --data_path=$DATASET \