forked from OSSInnovation/mindspore
!5768 Save the GPU backend multi-card output in different folders.
Merge pull request !5768 from linqingke/fasterrcnn
commit be62fd7fa6
@@ -154,7 +154,7 @@ sh run_distribute_train_ascend.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 
 ### Result
 
-Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log.
+Training results will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint files together with results like the following in loss_rankid.log (one log file per rank).
 
 
 ```
@@ -39,7 +39,7 @@ class LossCallBack(Callback):
         per_print_times (int): Print loss every times. Default: 1.
     """
 
-    def __init__(self, per_print_times=1):
+    def __init__(self, per_print_times=1, rank_id=0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
@@ -51,6 +51,7 @@ class LossCallBack(Callback):
         self.rpn_reg_loss_sum = 0
         self.rcnn_cls_loss_sum = 0
         self.rcnn_reg_loss_sum = 0
+        self.rank_id = rank_id
 
         global time_stamp_init, time_stamp_first
         if not time_stamp_init:
@@ -91,7 +92,7 @@ class LossCallBack(Callback):
 
             total_loss = rpn_loss + rcnn_loss
 
-            loss_file = open("./loss.log", "a+")
+            loss_file = open("./loss_{}.log".format(self.rank_id), "a+")
             loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, "
                             "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, total_loss: %.5f" %
                             (time_stamp_current - time_stamp_first, cb_params.cur_epoch_num, cur_step_in_epoch,
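For reference, a minimal sketch of the per-rank loss-logging pattern this hunk introduces. It is not the exact MindSpore `LossCallBack`; the callback base class and the `cb_params` plumbing are replaced by stand-ins so the example stays self-contained.

```python
# Hedged sketch of per-rank loss logging: each rank appends to its own
# loss_<rank_id>.log so concurrent processes never write to the same file.
import time


class PerRankLossLogger:
    """Simplified stand-in for the diff's LossCallBack(rank_id=...)."""

    def __init__(self, per_print_times=1, rank_id=0):
        if not isinstance(per_print_times, int) or per_print_times < 0:
            raise ValueError("print_step must be int and >= 0.")
        self._per_print_times = per_print_times
        self.rank_id = rank_id
        self._start_ms = int(time.time() * 1000)

    def step_end(self, epoch, step, loss):
        # One log file per rank: loss_0.log, loss_1.log, ...
        elapsed_ms = int(time.time() * 1000) - self._start_ms
        with open("./loss_{}.log".format(self.rank_id), "a") as f:
            f.write("%d epoch: %s step: %s, total_loss: %.5f\n"
                    % (elapsed_ms, epoch, step, loss))


# Example: rank 3 of a multi-card job writes to ./loss_3.log
logger = PerRankLossLogger(rank_id=3)
logger.step_end(epoch=1, step=10, loss=0.1234)
```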
@@ -131,7 +131,7 @@ if __name__ == '__main__':
     net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale)
 
     time_cb = TimeMonitor(data_size=dataset_size)
-    loss_cb = LossCallBack()
+    loss_cb = LossCallBack(rank_id=rank)
     cb = [time_cb, loss_cb]
     if config.save_checkpoint:
         ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
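In the train.py hunks the callback is built as `LossCallBack(rank_id=rank)`. A hedged sketch of where such a `rank` value typically comes from in an mpirun-launched GPU job follows; the MindSpore `get_rank()` call assumes `init("nccl")` has already run elsewhere in the script, and the environment-variable fallback is an Open MPI convention.

```python
# Sketch: obtaining the process rank used for per-rank log/checkpoint names.
import os


def current_rank():
    """Prefer MindSpore's communication API, fall back to Open MPI's env var."""
    try:
        from mindspore.communication.management import get_rank
        return get_rank()                      # valid once init("nccl") has run
    except Exception:                          # mindspore absent or not initialised
        return int(os.getenv("OMPI_COMM_WORLD_RANK", "0"))


rank = current_rank()
# loss_cb = LossCallBack(rank_id=rank)   # each rank then appends to loss_<rank>.log
```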
@@ -44,7 +44,7 @@ export CUDA_VISIBLE_DEVICES="$2"
 
 if [ $1 -gt 1 ]
 then
-    mpirun -n $1 --allow-run-as-root \
+    mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
     python3 ${BASEPATH}/../train.py > train.log 2>&1 &
 else
     python3 ${BASEPATH}/../train.py > train.log 2>&1 &
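The mpirun flags added throughout these scripts are standard Open MPI options: `--output-filename log_output` redirects each rank's output into its own file under the `log_output` directory, and `--merge-stderr-to-stdout` folds stderr into the same file. A small helper such as the one below can collect those per-rank files after training; it is only a sketch, since the exact nesting under `log_output` (for example a job-id or `rank.<n>` subdirectory) depends on the Open MPI release.

```python
# Hedged helper: gather whatever per-rank output files mpirun created under
# ./log_output, without assuming a particular directory layout.
import glob
import os


def collect_rank_logs(root="log_output"):
    """Return {relative_path: text} for every regular file below `root`."""
    logs = {}
    for path in glob.glob(os.path.join(root, "**", "*"), recursive=True):
        if os.path.isfile(path):
            with open(path, "r", errors="replace") as f:
                logs[os.path.relpath(path, root)] = f.read()
    return logs


if __name__ == "__main__":
    for name, text in sorted(collect_rank_logs().items()):
        print(name, "->", len(text), "bytes")
```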
@@ -14,4 +14,5 @@
 # limitations under the License.
 # ============================================================================
 DATA_DIR=$1
-mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \
+    python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
@@ -329,7 +329,7 @@ sh run_distribute_train.sh [RANK_TABLE_FILE] [PRETRAINED_MODEL]
 
 ### [Training Result](#content)
 
-Training result will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint file together with result like the followings in loss.log.
+Training results will be stored in the example path, whose folder name begins with "train" or "train_parallel". You can find checkpoint files together with results like the following in loss_rankid.log (one log file per rank).
 
 
 ```
@@ -40,7 +40,7 @@ class LossCallBack(Callback):
         per_print_times (int): Print loss every times. Default: 1.
     """
 
-    def __init__(self, per_print_times=1):
+    def __init__(self, per_print_times=1, rank_id=0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
@@ -53,6 +53,7 @@ class LossCallBack(Callback):
         self.rcnn_cls_loss_sum = 0
         self.rcnn_reg_loss_sum = 0
         self.rcnn_mask_loss_sum = 0
+        self.rank_id = rank_id
 
         global time_stamp_init, time_stamp_first
         if not time_stamp_init:
@@ -96,7 +97,7 @@ class LossCallBack(Callback):
 
             total_loss = rpn_loss + rcnn_loss
 
-            loss_file = open("./loss.log", "a+")
+            loss_file = open("./loss_{}.log".format(self.rank_id), "a+")
             loss_file.write("%lu epoch: %s step: %s ,rpn_loss: %.5f, rcnn_loss: %.5f, rpn_cls_loss: %.5f, "
                             "rpn_reg_loss: %.5f, rcnn_cls_loss: %.5f, rcnn_reg_loss: %.5f, rcnn_mask_loss: %.5f, "
                             "total_loss: %.5f" %
@@ -126,7 +126,7 @@ if __name__ == '__main__':
     net = TrainOneStepCell(net_with_loss, net, opt, sens=config.loss_scale)
 
     time_cb = TimeMonitor(data_size=dataset_size)
-    loss_cb = LossCallBack()
+    loss_cb = LossCallBack(rank_id=rank)
     cb = [time_cb, loss_cb]
     if config.save_checkpoint:
         ckptconfig = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * dataset_size,
@@ -72,7 +72,7 @@ run_gpu()
     cd ../train || exit
 
     export CUDA_VISIBLE_DEVICES="$3"
-    mpirun -n $2 --allow-run-as-root \
+    mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
     python ${BASEPATH}/../train.py \
         --platform=$1 \
         --dataset_path=$4 \
@@ -167,14 +167,14 @@ run_gpu(){
     env > env.log
     if [ $# == 3 ]
     then
-        mpirun --allow-run-as-root -n ${RANK_SIZE} \
+        mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \
         python train.py --device_target=$1 --dataset_path=$PATH1 &> train.log &
     fi
 
     if [ $# == 4 ]
     then
-        mpirun --allow-run-as-root -n ${RANK_SIZE} \
+        mpirun --allow-run-as-root -n ${RANK_SIZE} --output-filename log_output --merge-stderr-to-stdout \
         python train.py --device_target=$1 --dataset_path=$PATH1 --pre_trained=$PATH2 &> train.log &
     fi
 
     cd ..
@@ -38,7 +38,7 @@ run_gpu()
     cd ../train || exit
 
     export CUDA_VISIBLE_DEVICES="$3"
-    mpirun -n $2 --allow-run-as-root \
+    mpirun -n $2 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
     python ${BASEPATH}/../train.py \
         --dataset_path=$4 \
        --device_target=$1 \
@@ -14,4 +14,5 @@
 # limitations under the License.
 # ============================================================================
 DATA_DIR=$1
-mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout \
+    python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
@@ -80,14 +80,14 @@ cd ./train_parallel || exit
 
 if [ $# == 3 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py --net=$1 --dataset=$2 --run_distribute=True \
         --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
 fi
 
 if [ $# == 4 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py --net=$1 --dataset=$2 --run_distribute=True \
         --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
 fi
@@ -86,14 +86,14 @@ cp -r ../src ./sched
 cd ./sched || exit
 if [ $# == 3 ]
 then
-    mpirun --allow-run-as-root -n 1 \
+    mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
     python train.py --net=$1 --dataset=$2 --run_distribute=True \
         --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> sched.log &
 fi
 
 if [ $# == 4 ]
 then
-    mpirun --allow-run-as-root -n 1 \
+    mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
     python train.py --net=$1 --dataset=$2 --run_distribute=True \
         --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> sched.log &
 fi
@@ -111,14 +111,14 @@ do
     cd ./server_$i || exit
     if [ $# == 3 ]
     then
-        mpirun --allow-run-as-root -n 1 \
+        mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
         python train.py --net=$1 --dataset=$2 --run_distribute=True \
             --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> server_$i.log &
     fi
 
     if [ $# == 4 ]
     then
-        mpirun --allow-run-as-root -n 1 \
+        mpirun --allow-run-as-root -n 1 --output-filename log_output --merge-stderr-to-stdout \
         python train.py --net=$1 --dataset=$2 --run_distribute=True \
             --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> server_$i.log &
     fi
@@ -134,14 +134,14 @@ cp -r ../src ./worker
 cd ./worker || exit
 if [ $# == 3 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py --net=$1 --dataset=$2 --run_distribute=True \
         --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True &> worker.log &
 fi
 
 if [ $# == 4 ]
 then
-    mpirun --allow-run-as-root -n $RANK_SIZE \
+    mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py --net=$1 --dataset=$2 --run_distribute=True \
         --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 --parameter_server=True --pre_trained=$PATH2 &> worker.log &
 fi
@@ -41,6 +41,7 @@ cp *.sh ./train_parallel
 cp -r ../src ./train_parallel
 cd ./train_parallel || exit
 
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py --run_distribute=True \
         --device_num=$DEVICE_NUM --device_target="GPU" --dataset_path=$PATH1 &> log &
 
@@ -22,7 +22,7 @@ then
     PATH_CHECKPOINT=$2
 fi
 
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py \
     --is_distribute=1 \
     --platform="GPU" \
@@ -54,14 +54,14 @@ Dataset used: [imagenet](http://www.image-net.org/)
 +-- ShuffleNetV2
   +-- Readme.md                           # descriptions about ShuffleNetV2
   +-- scripts
-  ¦   +--run_distribute_train_for_gpu.sh  # shell script for distributed training
-  ¦   +--run_eval_for_gpu.sh              # shell script for evaluation
-  ¦   +--run_standalone_train_for_gpu.sh  # shell script for standalone training
+      +--run_distribute_train_for_gpu.sh  # shell script for distributed training
+      +--run_eval_for_gpu.sh              # shell script for evaluation
+      +--run_standalone_train_for_gpu.sh  # shell script for standalone training
   +-- src
-  ¦   +--config.py                        # parameter configuration
-  ¦   +--dataset.py                       # creating dataset
-  ¦   +--loss.py                          # loss function for network
-  ¦   +--lr_generator.py                  # learning rate config
+      +--config.py                        # parameter configuration
+      +--dataset.py                       # creating dataset
+      +--loss.py                          # loss function for network
+      +--lr_generator.py                  # learning rate config
   +-- train.py                            # training script
   +-- eval.py                             # evaluation script
   +-- blocks.py                           # ShuffleNetV2 blocks
@@ -83,7 +83,7 @@ You can start training using python or shell scripts. The usage of shell scripts
 ```
 # training example
 python:
-    GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 &
+    GPU: mpirun --allow-run-as-root -n 8 --output-filename log_output --merge-stderr-to-stdout python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 &
 
 shell:
     GPU: cd scripts & sh run_distribute_train_for_gpu.sh 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
@@ -50,7 +50,7 @@ export CUDA_VISIBLE_DEVICES="$2"
 
 if [ $# == 3 ]
 then
-    mpirun -n $1 --allow-run-as-root \
+    mpirun -n $1 --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
     python ${BASEPATH}/../train.py --platform='GPU' --is_distributed=True --dataset_path=$3 > train.log 2>&1 &
 fi
 
@@ -22,7 +22,8 @@ echo "==========================================================================
 
 DATA_PATH=$1
 
-mpirun -n 8 python train.py \
+mpirun -n 8 --output-filename log_output --merge-stderr-to-stdout \
+    python train.py \
     --device_target="GPU" \
     --dataset="imagenet2012" \
     --is_distributed=1 \
@@ -44,7 +44,7 @@ cp ../*.py ./distribute_train
 cp -r ../src ./distribute_train
 cd ./distribute_train || exit
 
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python train.py \
     --dataset_path=$DATASET_PATH \
     --platform=GPU \
@@ -53,7 +53,8 @@ cp ../*.py ./train_parallel
 cp -r ../src ./train_parallel
 cd ./train_parallel || exit
 env > env.log
-mpirun --allow-run-as-root -n ${DEVICE_NUM} python train.py \
+mpirun --allow-run-as-root -n ${DEVICE_NUM} --output-filename log_output --merge-stderr-to-stdout \
+    python train.py \
     --data_dir=$DATASET_PATH \
     --pretrained_backbone=$PRETRAINED_BACKBONE \
     --device_target=GPU \
@@ -26,7 +26,7 @@ EPOCH_SIZE=$2
 DATA_DIR=$3
 SCHEMA_DIR=$4
 
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python run_pretrain.py \
     --device_target="GPU" \
     --distribute="true" \
@@ -146,7 +146,8 @@ if [ "$task" == "train" ]
 then
     if [ $RANK_SIZE -gt 1 ]
     then
-        mpirun -n $RANK_SIZE python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
+        mpirun -n $RANK_SIZE --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
+        python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
     fi
     python train.py --config ${configurations##*/} --platform GPU >>log.log 2>&1 &
 elif [ "$task" == "infer" ]
@@ -33,12 +33,13 @@ class LossCallBack(Callback):
     time_stamp_init = False
     time_stamp_first = 0
 
-    def __init__(self, config: TransformerConfig, per_print_times: int = 1):
+    def __init__(self, config: TransformerConfig, per_print_times: int = 1, rank_id: int = 0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
         self.config = config
         self._per_print_times = per_print_times
+        self.rank_id = rank_id
 
         if not self.time_stamp_init:
             self.time_stamp_first = self._get_ms_timestamp()
@@ -46,7 +47,7 @@ class LossCallBack(Callback):
 
     def step_end(self, run_context):
         cb_params = run_context.original_args()
-        file_name = "./loss.log"
+        file_name = "./loss_{}.log".format(self.rank_id)
         with open(file_name, "a+") as f:
             time_stamp_current = self._get_ms_timestamp()
             f.write("time: {}, epoch: {}, step: {}, outputs are {},{},{}.\n".format(
@@ -199,24 +199,28 @@ def _build_training_pipeline(config: TransformerConfig,
                                          scale_update_cell=scale_manager.get_update_cell())
     net_with_grads.set_train(True)
     model = Model(net_with_grads)
-    loss_monitor = LossCallBack(config)
     ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                    keep_checkpoint_max=config.keep_ckpt_max)
 
     rank_size = os.getenv('RANK_SIZE')
-    callbacks = [loss_monitor]
-    if rank_size is not None and int(rank_size) > 1 and MultiAscend.get_rank() % 8 == 0:
-        ckpt_callback = ModelCheckpoint(
-            prefix=config.ckpt_prefix,
-            directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
-            config=ckpt_config)
-        callbacks.append(ckpt_callback)
+    callbacks = []
+    if rank_size is not None and int(rank_size) > 1:
+        loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank())
+        callbacks.append(loss_monitor)
+        if MultiAscend.get_rank() % 8 == 0:
+            ckpt_callback = ModelCheckpoint(
+                prefix=config.ckpt_prefix,
+                directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(MultiAscend.get_rank())),
+                config=ckpt_config)
+            callbacks.append(ckpt_callback)
 
     if rank_size is None or int(rank_size) == 1:
         ckpt_callback = ModelCheckpoint(
             prefix=config.ckpt_prefix,
             directory=os.path.join(config.ckpt_path, 'ckpt_{}'.format(os.getenv('DEVICE_ID'))),
             config=ckpt_config)
+        loss_monitor = LossCallBack(config, rank_id=os.getenv('DEVICE_ID'))
+        callbacks.append(loss_monitor)
         callbacks.append(ckpt_callback)
 
     print(f" | ALL SET, PREPARE TO TRAIN.")
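The hunk above restructures callback selection: in a distributed run every rank gets its own loss monitor keyed by its rank, only the rank that satisfies `rank % 8 == 0` on a node saves checkpoints into `ckpt_<rank>`, and a single-card run keeps using DEVICE_ID. A simplified, hedged sketch of that decision logic follows; stand-in tuples replace `ModelCheckpoint`/`LossCallBack` so the control flow can be run without MindSpore, and the single-card branch is folded into an `else` for brevity.

```python
# Simplified sketch of the per-rank callback assembly introduced above.
import os


def build_callbacks(rank_size, rank, device_id="0", ckpt_path="ckpt_path"):
    callbacks = []
    if rank_size is not None and int(rank_size) > 1:
        # Every rank logs its own loss file ...
        callbacks.append(("loss_monitor", "loss_{}.log".format(rank)))
        # ... but only one rank per 8-card node writes checkpoints.
        if rank % 8 == 0:
            callbacks.append(("checkpoint", os.path.join(ckpt_path, "ckpt_{}".format(rank))))
    else:
        # Single-card path: DEVICE_ID names both the checkpoint dir and the loss log.
        callbacks.append(("checkpoint", os.path.join(ckpt_path, "ckpt_{}".format(device_id))))
        callbacks.append(("loss_monitor", "loss_{}.log".format(device_id)))
    return callbacks


print(build_callbacks(rank_size="8", rank=0))   # loss monitor + checkpoint
print(build_callbacks(rank_size="8", rank=3))   # loss monitor only
print(build_callbacks(rank_size=None, rank=0))  # single-card path
```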
@@ -29,7 +29,7 @@ TEACHER_CKPT_PATH=$5
 
 PROJECT_DIR=$(cd "$(dirname "$0")" || exit; pwd)
 
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python ${PROJECT_DIR}/../run_general_distill.py \
     --distribute="true" \
     --device_target="GPU" \
@@ -53,11 +53,12 @@ class LossCallBack(Callback):
     Args:
         per_print_times (int): Print loss every times. Default: 1.
     """
-    def __init__(self, per_print_times=1):
+    def __init__(self, per_print_times=1, rank_id=0):
         super(LossCallBack, self).__init__()
         if not isinstance(per_print_times, int) or per_print_times < 0:
             raise ValueError("print_step must be int and >= 0.")
         self._per_print_times = per_print_times
+        self.rank_id = rank_id
         global time_stamp_init, time_stamp_first
         if not time_stamp_init:
             time_stamp_first = get_ms_timestamp()
@@ -71,7 +72,7 @@ class LossCallBack(Callback):
         print("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
                                                                      cb_params.cur_epoch_num, cb_params.cur_step_num,
                                                                      str(cb_params.net_outputs)))
-        with open("./loss.log", "a+") as f:
+        with open("./loss_{}.log".format(self.rank_id), "a+") as f:
             f.write("time: {}, epoch: {}, step: {}, outputs are {}".format(time_stamp_current - time_stamp_first,
                                                                            cb_params.cur_epoch_num,
                                                                            cb_params.cur_step_num,
@@ -145,7 +146,7 @@ def run_transformer_train():
                                       min_lr=cfg.lr_schedule.min_lr), mstype.float32)
     optimizer = Adam(netwithloss.trainable_params(), lr)
 
-    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack()]
+    callbacks = [TimeMonitor(dataset.get_dataset_size()), LossCallBack(rank_id=rank_id)]
     if args.enable_save_ckpt == "true":
         if device_num == 1 or (device_num > 1 and rank_id == 0):
             ckpt_config = CheckpointConfig(save_checkpoint_steps=args.save_checkpoint_steps,
@@ -28,7 +28,7 @@ cp *.py ./log
 cp -r src ./log
 cd ./log || exit
 env > env.log
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python -u train.py \
     --dataset_path=$DATA_URL \
     --ckpt_path="checkpoint" \
@@ -21,7 +21,7 @@ RANK_SIZE=$1
 EPOCH_SIZE=$2
 DATASET=$3
 
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python -s ${self_path}/../train_and_eval_distribute.py \
     --device_target="GPU" \
     --data_path=$DATASET \
@@ -23,7 +23,7 @@ DATASET=$3
 VOCAB_SIZE=$4
 EMB_DIM=$5
 
-mpirun --allow-run-as-root -n $RANK_SIZE \
+mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
     python -s ${self_path}/../train_and_eval_auto_parallel.py \
     --device_target="GPU" \
     --data_path=$DATASET \