From 845ff342eda0f7760d0465bdd382a131db34cc7f Mon Sep 17 00:00:00 2001 From: linqingke Date: Fri, 4 Sep 2020 16:14:38 +0800 Subject: [PATCH] Save the GPU backend multi-card output in different folders. --- model_zoo/official/cv/faster_rcnn/README.md | 2 +- .../cv/faster_rcnn/src/network_define.py | 5 +++-- model_zoo/official/cv/faster_rcnn/train.py | 2 +- .../cv/googlenet/scripts/run_train_gpu.sh | 2 +- .../scripts/run_distribute_train_gpu.sh | 3 ++- model_zoo/official/cv/maskrcnn/README.md | 2 +- .../cv/maskrcnn/src/network_define.py | 5 +++-- model_zoo/official/cv/maskrcnn/train.py | 2 +- .../cv/mobilenetv2/scripts/run_train.sh | 2 +- .../cv/mobilenetv2_quant/scripts/run_train.sh | 6 +++--- .../cv/mobilenetv3/scripts/run_train.sh | 2 +- .../scripts/run_distribute_train_for_gpu.sh | 3 ++- .../scripts/run_distribute_train_gpu.sh | 4 ++-- .../scripts/run_parameter_server_train_gpu.sh | 12 +++++------ .../scripts/run_distribute_train_gpu.sh | 7 ++++--- .../scripts/run_distribute_train_for_gpu.sh | 2 +- model_zoo/official/cv/shufflenetv2/Readme.md | 16 +++++++-------- .../scripts/run_distribute_train_for_gpu.sh | 2 +- .../vgg16/scripts/run_distribute_train_gpu.sh | 3 ++- .../scripts/run_distribute_train_for_gpu.sh | 2 +- .../scripts/run_distribute_train_gpu.sh | 3 ++- .../run_distributed_pretrain_for_gpu.sh | 2 +- .../official/nlp/mass/scripts/run_gpu.sh | 3 ++- .../nlp/mass/src/utils/loss_monitor.py | 5 +++-- model_zoo/official/nlp/mass/train.py | 20 +++++++++++-------- .../scripts/run_distributed_gd_gpu.sh | 2 +- model_zoo/official/nlp/transformer/train.py | 7 ++++--- .../scripts/run_distribute_train_gpu.sh | 2 +- .../script/run_multigpu_train.sh | 2 +- .../script/run_multigpu_train_host_device.sh | 2 +- 30 files changed, 73 insertions(+), 59 deletions(-) diff --git a/model_zoo/official/cv/faster_rcnn/README.md b/model_zoo/official/cv/faster_rcnn/README.md index c73c855c352..085118239b9 100644 --- a/model_zoo/official/cv/faster_rcnn/README.md +++ b/model_zoo/official/cv/faster_rcnn/README.md @@ -154,7 +154,7 @@ sh run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] ### Result -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log. +Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint files together with results like the following in loss_rankid.log. ``` diff --git a/model_zoo/official/cv/faster_rcnn/src/network_define.py b/model_zoo/official/cv/faster_rcnn/src/network_define.py index bfa498bccc0..658d5c0876a 100644 --- a/model_zoo/official/cv/faster_rcnn/src/network_define.py +++ b/model_zoo/official/cv/faster_rcnn/src/network_define.py @@ -39,7 +39,7 @@ class LossCallBack(Callback): per_print_times (int): Print loss every times. Default: 1.
""" - def __init__(self, per_print_times=1): + def __init__(self, per_print_times=1, rank_id=0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") @@ -51,6 +51,7 @@ class LossCallBack(Callback): self.rpn_reg_loss_sum = 0 self.rcnn_cls_loss_sum = 0 self.rcnn_reg_loss_sum = 0 + self.rank_id = rank_id global time_stamp_init, time_stamp_first if not time_stamp_init: @@ -91,7 +92,7 @@ class LossCallBack(Callback): total_loss = rpn_loss + rcnn_loss - loss_file = open("./loss.log", "a+") + loss_file = open("./loss_{}.log".format(self.rank_id), "a+") loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, " "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, total_loss: %.5f" % (time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cur_step_in_epoch, diff --git a/model_zoo/official/cv/faster_rcnn/train.py b/model_zoo/official/cv/faster_rcnn/train.py index 53238a0dd75..7da7dce2750 100644 --- a/model_zoo/official/cv/faster_rcnn/train.py +++ b/model_zoo/official/cv/faster_rcnn/train.py @@ -131,7 +131,7 @@ if __name__ == '__main__': net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale) time_cb = TimeMonitor(data_size=dataset_size) - loss_cb = LossCallBack() + loss_cb = LossCallBack(rank_id=rank) cb = [time_cb, loss_cb] if config.save_checkpoint: ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size, diff --git a/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh b/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh index b30160238d0..6357b395185 100644 --- a/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh +++ b/model_zoo/official/cv/googlenet/scripts/run_train_gpu.sh @@ -44,7 +44,7 @@ export CUDA_VISIBLE_DEVICES="$2" if [ $1 -gt 1 ] then - mpirun -n $1 --allow-run-as-root \ + mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python3 ${BASEPATH}/../train.py > train.log 2>&1 & else python3 ${BASEPATH}/../train.py > train.log 2>&1 & diff --git a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh index 305f1dcfff5..0ddfde76b91 100644 --- a/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/inceptionv3/scripts/run_distribute_train_gpu.sh @@ -14,4 +14,5 @@ # limitations under the License. # ============================================================================ DATA_DIR=$1 -mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & +mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \ + python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & diff --git a/model_zoo/official/cv/maskrcnn/README.md b/model_zoo/official/cv/maskrcnn/README.md index df4e699d2b5..25ec1832daa 100644 --- a/model_zoo/official/cv/maskrcnn/README.md +++ b/model_zoo/official/cv/maskrcnn/README.md @@ -329,7 +329,7 @@ sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL] ### [Training Result](#content) -Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log. 
+Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint files together with results like the following in loss_rankid.log. ``` diff --git a/model_zoo/official/cv/maskrcnn/src/network_define.py b/model_zoo/official/cv/maskrcnn/src/network_define.py index 1229d13f4c7..841f0fc3eb1 100644 --- a/model_zoo/official/cv/maskrcnn/src/network_define.py +++ b/model_zoo/official/cv/maskrcnn/src/network_define.py @@ -40,7 +40,7 @@ class LossCallBack(Callback): per_print_times (int): Print loss every times. Default: 1. """ - def __init__(self, per_print_times=1): + def __init__(self, per_print_times=1, rank_id=0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") @@ -53,6 +53,7 @@ class LossCallBack(Callback): self.rcnn_cls_loss_sum = 0 self.rcnn_reg_loss_sum = 0 self.rcnn_mask_loss_sum = 0 + self.rank_id = rank_id global time_stamp_init, time_stamp_first if not time_stamp_init: @@ -96,7 +97,7 @@ class LossCallBack(Callback): total_loss = rpn_loss + rcnn_loss - loss_file = open("./loss.log", "a+") + loss_file = open("./loss_{}.log".format(self.rank_id), "a+") loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, " "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, rcnn_mask_loss: %.5f, " "total_loss: %.5f" % diff --git a/model_zoo/official/cv/maskrcnn/train.py b/model_zoo/official/cv/maskrcnn/train.py index 0081cec6d3c..cbe0a492a97 100644 --- a/model_zoo/official/cv/maskrcnn/train.py +++ b/model_zoo/official/cv/maskrcnn/train.py @@ -126,7 +126,7 @@ if __name__ == '__main__': net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale) time_cb = TimeMonitor(data_size=dataset_size) - loss_cb = LossCallBack() + loss_cb = LossCallBack(rank_id=rank) cb = [time_cb, loss_cb] if config.save_checkpoint: ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size, diff --git a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh index c7eea7ef965..0680c3d6ca2 100644 --- a/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2/scripts/run_train.sh @@ -72,7 +72,7 @@ run_gpu() cd ../train || exit export CUDA_VISIBLE_DEVICES="$3" - mpirun -n $2 --allow-run-as-root \ + mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python ${BASEPATH}/../train.py \ --platform=$1 \ --dataset_path=$4 \ diff --git a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh index 51abcda0bd9..a2732619309 100644 --- a/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv2_quant/scripts/run_train.sh @@ -277,14 +277,14 @@ run_gpu(){ env > env.log if [ $# == 3 ] then - mpirun --allow-run-as-root -n ${RANK_SIZE} \ + mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \ python train.py --device_target=$1 --dataset_path=$PATH1 &> train.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n ${RANK_SIZE} \ - python train.py --device_target=$1 --dataset_path=$PATH1 --pre_trained=$PATH2 &> train.log & + mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \ + python train.py --device_target=$1 --dataset_path=$PATH1
--pre_trained=$PATH2 &> train.log & fi cd .. diff --git a/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh b/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh index 25c0667a590..e9f1ac745dd 100644 --- a/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh +++ b/model_zoo/official/cv/mobilenetv3/scripts/run_train.sh @@ -38,7 +38,7 @@ run_gpu() cd ../train || exit export CUDA_VISIBLE_DEVICES="$3" - mpirun -n $2 --allow-run-as-root \ + mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python ${BASEPATH}/../train.py \ --dataset_path=$4 \ --device_target=$1 \ diff --git a/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh index 305f1dcfff5..0ddfde76b91 100755 --- a/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/nasnet/scripts/run_distribute_train_for_gpu.sh @@ -14,4 +14,5 @@ # limitations under the License. # ============================================================================ DATA_DIR=$1 -mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & +mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \ + python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 & diff --git a/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh index d8e1139b637..ff3b023536d 100755 --- a/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_distribute_train_gpu.sh @@ -80,14 +80,14 @@ cd ./train_parallel || exit if [ $# == 3 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log & fi diff --git a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh index ecbb345eed4..1941cc234ff 100755 --- a/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh +++ b/model_zoo/official/cv/resnet/scripts/run_parameter_server_train_gpu.sh @@ -86,14 +86,14 @@ cp -r ../src ./sched cd ./sched || exit if [ $# == 3 ] then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log & fi @@ -111,14 +111,14 @@ do cd ./server_$i || exit if [ $# == 3 ] 
then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n 1 \ + mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log & fi @@ -134,14 +134,14 @@ cp -r ../src ./worker cd ./worker || exit if [ $# == 3 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log & fi if [ $# == 4 ] then - mpirun --allow-run-as-root -n $RANK_SIZE \ + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py --net=$1 --dataset=$2 --run_distribute=True \ --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log & fi diff --git a/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh index a5799c71cd4..ffe821c2968 100755 --- a/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/resnet_thor/scripts/run_distribute_train_gpu.sh @@ -41,6 +41,7 @@ cp *.sh ./train_parallel cp -r ../src ./train_parallel cd ./train_parallel || exit -mpirun --allow-run-as-root -n $RANK_SIZE \ -python train.py --run_distribute=True \ ---device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & \ No newline at end of file +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ + python train.py --run_distribute=True \ + --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log & + diff --git a/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh index 6ab980a0fad..aa8ca74462a 100644 --- a/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/resnext50/scripts/run_distribute_train_for_gpu.sh @@ -22,7 +22,7 @@ then PATH_CHECKPOINT=$2 fi -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py \ --is_distribute=1 \ --platform="GPU" \ diff --git a/model_zoo/official/cv/shufflenetv2/Readme.md b/model_zoo/official/cv/shufflenetv2/Readme.md index ff83b656d4e..b975d951313 100644 --- a/model_zoo/official/cv/shufflenetv2/Readme.md +++ b/model_zoo/official/cv/shufflenetv2/Readme.md @@ -54,14 +54,14 @@ Dataset used: [imagenet](http://www.image-net.org/) +-- ShuffleNetV2 +-- Readme.md # descriptions about ShuffleNetV2 +-- scripts - ¦ +--run_distribute_train_for_gpu.sh # shell script for distributed training - ¦ +--run_eval_for_gpu.sh # shell script for evaluation - ¦ +--run_standalone_train_for_gpu.sh # shell script for standalone training + +--run_distribute_train_for_gpu.sh # shell script for distributed training 
+ +--run_eval_for_gpu.sh # shell script for evaluation + +--run_standalone_train_for_gpu.sh # shell script for standalone training +-- src - ¦ +--config.py # parameter configuration - ¦ +--dataset.py # creating dataset - ¦ +--loss.py # loss function for network - ¦ +--lr_generator.py # learning rate config + +--config.py # parameter configuration + +--dataset.py # creating dataset + +--loss.py # loss function for network + +--lr_generator.py # learning rate config +-- train.py # training script +-- eval.py # evaluation script +-- blocks.py # ShuffleNetV2 blocks @@ -83,7 +83,7 @@ You can start training using python or shell scripts. The usage of shell scripts ``` # training example python: - GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 & + GPU: mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 & shell: GPU: cd scripts & sh run_distribute_train_for_gpu.sh 8 0,1,2,3,4,5,6,7 ~/imagenet/train/ diff --git a/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh index ec03ea7bfe2..edea873b64e 100644 --- a/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh @@ -50,7 +50,7 @@ export CUDA_VISIBLE_DEVICES="$2" if [ $# == 3 ] then - mpirun -n $1 --allow-run-as-root \ + mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ python ${BASEPATH}/../train.py --platform='GPU' --is_distributed=True --dataset_path=$3 > train.log 2>&1 & fi diff --git a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh index 2784822d430..bdab8ff7b25 100644 --- a/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/vgg16/scripts/run_distribute_train_gpu.sh @@ -22,7 +22,8 @@ echo "========================================================================== DATA_PATH=$1 -mpirun -n 8 python train.py \ +mpirun -n 8 --output-filename log_output --merge-stderr-to-stdout \ + python train.py \ --device_target="GPU" \ --dataset="imagenet2012" \ --is_distributed=1 \ diff --git a/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh b/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh index 86d951f6633..816079871c0 100644 --- a/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh +++ b/model_zoo/official/cv/warpctc/scripts/run_distribute_train_for_gpu.sh @@ -44,7 +44,7 @@ cp ../*.py ./distribute_train cp -r ../src ./distribute_train cd ./distribute_train || exit -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python train.py \ --dataset_path=$DATASET_PATH \ --platform=GPU \ diff --git a/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh index b54070844cb..41d00b30513 100644 --- a/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/cv/yolov3_darknet53/scripts/run_distribute_train_gpu.sh @@ -53,7 +53,8 @@ cp ../*.py ./train_parallel cp -r ../src ./train_parallel cd 
./train_parallel || exit env > env.log -mpirun --allow-run-as-root -n ${DEVICE_NUM} python train.py \ +mpirun --allow-run-as-root -n ${DEVICE_NUM} --output-filename log_output --merge-stderr-to-stdout \ +python train.py \ --data_dir=$DATASET_PATH \ --pretrained_backbone=$PRETRAINED_BACKBONE \ --device_target=GPU \ diff --git a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh index 7ca644652ea..9fbc156b665 100644 --- a/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh +++ b/model_zoo/official/nlp/bert/scripts/run_distributed_pretrain_for_gpu.sh @@ -26,7 +26,7 @@ EPOCH_SIZE=$2 DATA_DIR=$3 SCHEMA_DIR=$4 -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python run_pretrain.py \ --device_target="GPU" \ --distribute="true" \ diff --git a/model_zoo/official/nlp/mass/scripts/run_gpu.sh b/model_zoo/official/nlp/mass/scripts/run_gpu.sh index aae93bbac7c..053b2d9b651 100644 --- a/model_zoo/official/nlp/mass/scripts/run_gpu.sh +++ b/model_zoo/official/nlp/mass/scripts/run_gpu.sh @@ -146,7 +146,8 @@ if [ "$task" == "train" ] then if [ $RANK_SIZE -gt 1 ] then - mpirun -n $RANK_SIZE python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 & + mpirun -n $RANK_SIZE --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \ + python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 & fi python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 & elif [ "$task" == "infer" ] diff --git a/model_zoo/official/nlp/mass/src/utils/loss_monitor.py b/model_zoo/official/nlp/mass/src/utils/loss_monitor.py index c62230d6ae3..1d3467363af 100644 --- a/model_zoo/official/nlp/mass/src/utils/loss_monitor.py +++ b/model_zoo/official/nlp/mass/src/utils/loss_monitor.py @@ -33,12 +33,13 @@ class LossCallBack(Callback): time_stamp_init = False time_stamp_first = 0 - def __init__(self, config: TransformerConfig, per_print_times: int = 1): + def __init__(self, config: TransformerConfig, per_print_times: int = 1, rank_id: int = 0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") self.config = config self._per_print_times = per_print_times + self.rank_id = rank_id if not self.time_stamp_init: self.time_stamp_first = self._get_ms_timestamp() @@ -46,7 +47,7 @@ class LossCallBack(Callback): def step_end(self, run_context): cb_params = run_context.original_args() - file_name = "./loss.log" + file_name = "./loss_{}.log".format(self.rank_id) with open(file_name, "a+") as f: time_stamp_current = self._get_ms_timestamp() f.write("time: {}, epoch: {}, step: {}, outputs are {},{},{}.\n".format( diff --git a/model_zoo/official/nlp/mass/train.py b/model_zoo/official/nlp/mass/train.py index 80ed331b54f..e78cd1b1da7 100644 --- a/model_zoo/official/nlp/mass/train.py +++ b/model_zoo/official/nlp/mass/train.py @@ -199,24 +199,28 @@ def _build_training_pipeline(config: TransformerConfig, scale_update_cell=scale_manager.get_update_cell()) net_with_grads.set_train(True) model = Model(net_with_grads) - loss_monitor = LossCallBack(config) ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps, keep_checkpoint_max=config.keep_ckpt_max) rank_size = os.getenv('RANK_SIZE') - callbacks = [loss_monitor] - if rank_size is not None and int(rank_size) > 1 
and MultiAscend.get_rank() % 8 == 0: - ckpt_callback = ModelCheckpoint( - prefix=config.ckpt_prefix, - directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), - config=ckpt_config) - callbacks.append(ckpt_callback) + callbacks = [] + if rank_size is not None and int(rank_size) > 1: + loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank()) + callbacks.append(loss_monitor) + if MultiAscend.get_rank() % 8 == 0: + ckpt_callback = ModelCheckpoint( + prefix=config.ckpt_prefix, + directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiAscend.get_rank())), + config=ckpt_config) + callbacks.append(ckpt_callback) if rank_size is None or int(rank_size) == 1: ckpt_callback = ModelCheckpoint( prefix=config.ckpt_prefix, directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))), config=ckpt_config) + loss_monitor = LossCallBack(config, rank_id=os.getenv('DEVICE_ID')) + callbacks.append(loss_monitor) callbacks.append(ckpt_callback) print(f" | ALL SET, PREPARE TO TRAIN.") diff --git a/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh index d09f49760da..987d1fe6f85 100644 --- a/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh +++ b/model_zoo/official/nlp/tinybert/scripts/run_distributed_gd_gpu.sh @@ -29,7 +29,7 @@ TEACHER_CKPT_PATH=$5 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd) -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python ${PROJECT_DIR}/../run_general_distill.py \ --distribute="true" \ --device_target="GPU" \ diff --git a/model_zoo/official/nlp/transformer/train.py b/model_zoo/official/nlp/transformer/train.py index a1d040fad4c..60a8535699c 100644 --- a/model_zoo/official/nlp/transformer/train.py +++ b/model_zoo/official/nlp/transformer/train.py @@ -53,11 +53,12 @@ class LossCallBack(Callback): Args: per_print_times (int): Print loss every times. Default: 1. 
""" - def __init__(self, per_print_times=1): + def __init__(self, per_print_times=1, rank_id=0): super(LossCallBack, self).__init__() if not isinstance(per_print_times, int) or per_print_times < 0: raise ValueError("print_step must be int and >= 0.") self._per_print_times = per_print_times + self.rank_id = rank_id global time_stamp_init, time_stamp_first if not time_stamp_init: time_stamp_first = get_ms_timestamp() @@ -71,7 +72,7 @@ class LossCallBack(Callback): print("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cb_params.cur_step_num, str(cb_params.net_outputs))) - with open("./loss.log", "a+") as f: + with open("./loss_{}.log".fromat(self.rank_id), "a+") as f: f.write("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cb_params.cur_step_num, @@ -145,7 +146,7 @@ def run_transformer_train(): min_lr=cfg.lr_schedule.min_lr), mstype.float32) optimizer = Adam(netwithloss.trainable_params(), lr) - callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()] + callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)] if args.enable_save_ckpt == "true": if device_num == 1 or (device_num > 1 and rank_id == 0): ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps, diff --git a/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh b/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh index 832cc409d48..7cf8f513c6c 100644 --- a/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh +++ b/model_zoo/official/recommend/deepfm/scripts/run_distribute_train_gpu.sh @@ -28,7 +28,7 @@ cp *.py ./log cp -r src ./log cd ./log || exit env > env.log -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python -u train.py \ --dataset_path=$DATA_URL \ --ckpt_path="checkpoint" \ diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh index ae8d31e36ec..9403ad685ee 100644 --- a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh +++ b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train.sh @@ -21,7 +21,7 @@ RANK_SIZE=$1 EPOCH_SIZE=$2 DATASET=$3 -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python -s ${self_path}/../train_and_eval_distribute.py \ --device_target="GPU" \ --data_path=$DATASET \ diff --git a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh index 772e01f767d..475b3184588 100644 --- a/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh +++ b/model_zoo/official/recommend/wide_and_deep/script/run_multigpu_train_host_device.sh @@ -23,7 +23,7 @@ DATASET=$3 VOCAB_SIZE=$4 EMB_DIM=$5 -mpirun --allow-run-as-root -n $RANK_SIZE \ +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ python -s ${self_path}/../train_and_eval_auto_parallel.py \ --device_target="GPU" \ --data_path=$DATASET \