modify yolov3&ssd shell script

This commit is contained in:
chengxianbin 2020-05-25 10:34:46 +08:00
parent 89a86611b4
commit 5d77480c7e
5 changed files with 121 additions and 41 deletions

View File

@ -14,13 +14,20 @@
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "================================================================================================================="
echo "Please run the scipt as: "
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDSPORE_HCCL_CONFIG_PATH"
echo "for example: sh run_distribute_train.sh 8 150 coco /data/hccl.json"
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "for example: sh run_distribute_train.sh 8 350 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
echo "It is better to use absolute path."
echo "The learning rate is 0.4 as default, if you want other lr, please change the value in this script."
echo "=============================================================================================================="
echo "================================================================================================================="
if [ $# != 4 ] && [ $# != 6 ]
then
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [DATASET] \
[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
exit 1
fi
# Before start distribute train, first create mindrecord files.
python train.py --only_create_dataset=1
@ -30,6 +37,8 @@ echo "After running the scipt, the network runs in the background. The log will
export RANK_SIZE=$1
EPOCH_SIZE=$2
DATASET=$3
PRE_TRAINED=$5
PRE_TRAINED_EPOCH_SIZE=$6
export MINDSPORE_HCCL_CONFIG_PATH=$4
@ -43,12 +52,29 @@ do
export RANK_ID=$i
echo "start training for rank $i, device $DEVICE_ID"
env > env.log
python ../train.py \
--distribute=1 \
--lr=0.4 \
--dataset=$DATASET \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
if [ $# == 4 ]
then
python ../train.py \
--distribute=1 \
--lr=0.4 \
--dataset=$DATASET \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
fi
if [ $# == 6 ]
then
python ../train.py \
--distribute=1 \
--lr=0.4 \
--dataset=$DATASET \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--pre_trained=$PRE_TRAINED \
--pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
--epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
fi
cd ../
done

View File

@ -88,6 +88,7 @@ def main():
parser.add_argument("--epoch_size", type=int, default=70, help="Epoch size, default is 70.")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained Checkpoint file path.")
parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.")
parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
args_opt = parser.parse_args()
@ -150,17 +151,20 @@ def main():
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=None, config=ckpt_config)
lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=args_opt.lr,
warmup_epochs=max(args_opt.epoch_size // 20, 1),
total_epochs=args_opt.epoch_size,
if args_opt.pre_trained:
if args_opt.pre_trained_epoch_size <= 0:
raise KeyError("pre_trained_epoch_size must be greater than 0.")
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(net, param_dict)
lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
lr_init=0, lr_end=0, lr_max=args_opt.lr,
warmup_epochs=max(350 // 20, 1),
total_epochs=350,
steps_per_epoch=dataset_size))
opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
net = TrainingWrapper(net, opt, loss_scale)
if args_opt.pre_trained:
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(net, param_dict)
callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
model = Model(net)

View File

@ -14,18 +14,27 @@
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "======================================================================================================================================================="
echo "Please run the scipt as: "
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH"
echo "for example: sh run_distribute_train.sh 8 100 /data/Mindrecord_train /data /data/train.txt /data/hccl.json"
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
echo "It is better to use absolute path."
echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
echo "=============================================================================================================="
echo "======================================================================================================================================================="
if [ $# != 6 ] && [ $# != 8 ]
then
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
exit 1
fi
EPOCH_SIZE=$2
MINDRECORD_DIR=$3
IMAGE_DIR=$4
ANNO_PATH=$5
PRE_TRAINED=$7
PRE_TRAINED_EPOCH_SIZE=$8
# Before start distribute train, first create mindrecord files.
python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR \
@ -51,14 +60,34 @@ do
export RANK_ID=$i
echo "start training for rank $i, device $DEVICE_ID"
env > env.log
taskset -c $cmdopt python ../train.py \
--distribute=1 \
--lr=0.005 \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--mindrecord_dir=$MINDRECORD_DIR \
--image_dir=$IMAGE_DIR \
--epoch_size=$EPOCH_SIZE \
--anno_path=$ANNO_PATH > log.txt 2>&1 &
if [ $# == 6 ]
then
taskset -c $cmdopt python ../train.py \
--distribute=1 \
--lr=0.005 \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--mindrecord_dir=$MINDRECORD_DIR \
--image_dir=$IMAGE_DIR \
--epoch_size=$EPOCH_SIZE \
--anno_path=$ANNO_PATH > log.txt 2>&1 &
fi
if [ $# == 8 ]
then
taskset -c $cmdopt python ../train.py \
--distribute=1 \
--lr=0.005 \
--device_num=$RANK_SIZE \
--device_id=$DEVICE_ID \
--mindrecord_dir=$MINDRECORD_DIR \
--image_dir=$IMAGE_DIR \
--epoch_size=$EPOCH_SIZE \
--pre_trained=$PRE_TRAINED \
--pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
--anno_path=$ANNO_PATH > log.txt 2>&1 &
fi
cd ../
done

View File

@ -14,10 +14,25 @@
# limitations under the License.
# ============================================================================
echo "=============================================================================================================="
echo "========================================================================================================================================="
echo "Please run the scipt as: "
echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH"
echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt"
echo "=============================================================================================================="
echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)"
echo "========================================================================================================================================="
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
if [ $# != 5 ] && [ $# != 7 ]
then
echo "Usage: sh run_standalone_train.sh [DEVICE_ID] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] \
[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
exit 1
fi
if [ $# == 5 ]
then
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
fi
if [ $# == 7 ]
then
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7
fi

View File

@ -71,6 +71,7 @@ def main():
parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10")
parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path")
parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size")
parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
@ -133,15 +134,20 @@ def main():
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)
lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=0, global_step=args_opt.epoch_size * dataset_size,
if args_opt.pre_trained:
if args_opt.pre_trained_epoch_size <= 0:
raise KeyError("pre_trained_epoch_size must be greater than 0.")
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(net, param_dict)
total_epoch_size = 60
if args_opt.distribute:
total_epoch_size = 160
lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
global_step=total_epoch_size * dataset_size,
decay_step=1000, decay_rate=0.95, steps=True))
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
net = TrainingWrapper(net, opt, loss_scale)
if args_opt.pre_trained:
param_dict = load_checkpoint(args_opt.pre_trained)
load_param_into_net(net, param_dict)
callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
model = Model(net)