forked from mindspore-Ecosystem/mindspore
Modify the yolov3 & ssd shell scripts to accept optional pre-trained checkpoint arguments
This commit is contained in:
parent
89a86611b4
commit
5d77480c7e
|
@ -14,13 +14,20 @@
|
|||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
echo "=============================================================================================================="
|
||||
echo "================================================================================================================="
|
||||
echo "Please run the scipt as: "
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDSPORE_HCCL_CONFIG_PATH"
|
||||
echo "for example: sh run_distribute_train.sh 8 150 coco /data/hccl.json"
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE DATASET MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
|
||||
echo "for example: sh run_distribute_train.sh 8 350 coco /data/hccl.json /opt/ssd-300.ckpt(optional) 200(optional)"
|
||||
echo "It is better to use absolute path."
|
||||
echo "The learning rate is 0.4 as default, if you want other lr, please change the value in this script."
|
||||
echo "=============================================================================================================="
|
||||
echo "================================================================================================================="
|
||||
|
||||
if [ $# != 4 ] && [ $# != 6 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [DATASET] \
|
||||
[MINDSPORE_HCCL_CONFIG_PATH] [PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
# Before start distribute train, first create mindrecord files.
|
||||
python train.py --only_create_dataset=1
|
||||
|
@ -30,6 +37,8 @@ echo "After running the scipt, the network runs in the background. The log will
|
|||
export RANK_SIZE=$1
|
||||
EPOCH_SIZE=$2
|
||||
DATASET=$3
|
||||
PRE_TRAINED=$5
|
||||
PRE_TRAINED_EPOCH_SIZE=$6
|
||||
export MINDSPORE_HCCL_CONFIG_PATH=$4
|
||||
|
||||
|
||||
|
@ -43,12 +52,29 @@ do
|
|||
export RANK_ID=$i
|
||||
echo "start training for rank $i, device $DEVICE_ID"
|
||||
env > env.log
|
||||
python ../train.py \
|
||||
--distribute=1 \
|
||||
--lr=0.4 \
|
||||
--dataset=$DATASET \
|
||||
--device_num=$RANK_SIZE \
|
||||
--device_id=$DEVICE_ID \
|
||||
--epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
|
||||
if [ $# == 4 ]
|
||||
then
|
||||
python ../train.py \
|
||||
--distribute=1 \
|
||||
--lr=0.4 \
|
||||
--dataset=$DATASET \
|
||||
--device_num=$RANK_SIZE \
|
||||
--device_id=$DEVICE_ID \
|
||||
--epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
|
||||
fi
|
||||
|
||||
if [ $# == 6 ]
|
||||
then
|
||||
python ../train.py \
|
||||
--distribute=1 \
|
||||
--lr=0.4 \
|
||||
--dataset=$DATASET \
|
||||
--device_num=$RANK_SIZE \
|
||||
--device_id=$DEVICE_ID \
|
||||
--pre_trained=$PRE_TRAINED \
|
||||
--pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
|
||||
--epoch_size=$EPOCH_SIZE > log.txt 2>&1 &
|
||||
fi
|
||||
|
||||
cd ../
|
||||
done
|
||||
|
|
|
@ -88,6 +88,7 @@ def main():
|
|||
parser.add_argument("--epoch_size", type=int, default=70, help="Epoch size, default is 70.")
|
||||
parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
|
||||
parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained Checkpoint file path.")
|
||||
parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size.")
|
||||
parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
|
||||
parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
|
||||
args_opt = parser.parse_args()
|
||||
|
@ -150,17 +151,20 @@ def main():
|
|||
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
|
||||
ckpoint_cb = ModelCheckpoint(prefix="ssd", directory=None, config=ckpt_config)
|
||||
|
||||
lr = Tensor(get_lr(global_step=0, lr_init=0, lr_end=0, lr_max=args_opt.lr,
|
||||
warmup_epochs=max(args_opt.epoch_size // 20, 1),
|
||||
total_epochs=args_opt.epoch_size,
|
||||
if args_opt.pre_trained:
|
||||
if args_opt.pre_trained_epoch_size <= 0:
|
||||
raise KeyError("pre_trained_epoch_size must be greater than 0.")
|
||||
param_dict = load_checkpoint(args_opt.pre_trained)
|
||||
load_param_into_net(net, param_dict)
|
||||
|
||||
lr = Tensor(get_lr(global_step=args_opt.pre_trained_epoch_size * dataset_size,
|
||||
lr_init=0, lr_end=0, lr_max=args_opt.lr,
|
||||
warmup_epochs=max(350 // 20, 1),
|
||||
total_epochs=350,
|
||||
steps_per_epoch=dataset_size))
|
||||
opt = nn.Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, 0.9, 0.0001, loss_scale)
|
||||
net = TrainingWrapper(net, opt, loss_scale)
|
||||
|
||||
if args_opt.pre_trained:
|
||||
param_dict = load_checkpoint(args_opt.pre_trained)
|
||||
load_param_into_net(net, param_dict)
|
||||
|
||||
callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
|
||||
|
||||
model = Model(net)
|
||||
|
|
|
@ -14,18 +14,27 @@
|
|||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
echo "=============================================================================================================="
|
||||
echo "======================================================================================================================================================="
|
||||
echo "Please run the scipt as: "
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH"
|
||||
echo "for example: sh run_distribute_train.sh 8 100 /data/Mindrecord_train /data /data/train.txt /data/hccl.json"
|
||||
echo "sh run_distribute_train.sh DEVICE_NUM EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH MINDSPORE_HCCL_CONFIG_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
|
||||
echo "For example: sh run_distribute_train.sh 8 150 /data/Mindrecord_train /data /data/train.txt /data/hccl.json /opt/yolov3-150.ckpt(optional) 100(optional)"
|
||||
echo "It is better to use absolute path."
|
||||
echo "The learning rate is 0.005 as default, if you want other lr, please change the value in this script."
|
||||
echo "=============================================================================================================="
|
||||
echo "======================================================================================================================================================="
|
||||
|
||||
if [ $# != 6 ] && [ $# != 8 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [DEVICE_NUM] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] [MINDSPORE_HCCL_CONFIG_PATH] \
|
||||
[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
EPOCH_SIZE=$2
|
||||
MINDRECORD_DIR=$3
|
||||
IMAGE_DIR=$4
|
||||
ANNO_PATH=$5
|
||||
PRE_TRAINED=$7
|
||||
PRE_TRAINED_EPOCH_SIZE=$8
|
||||
|
||||
# Before start distribute train, first create mindrecord files.
|
||||
python train.py --only_create_dataset=1 --mindrecord_dir=$MINDRECORD_DIR --image_dir=$IMAGE_DIR \
|
||||
|
@ -51,14 +60,34 @@ do
|
|||
export RANK_ID=$i
|
||||
echo "start training for rank $i, device $DEVICE_ID"
|
||||
env > env.log
|
||||
taskset -c $cmdopt python ../train.py \
|
||||
--distribute=1 \
|
||||
--lr=0.005 \
|
||||
--device_num=$RANK_SIZE \
|
||||
--device_id=$DEVICE_ID \
|
||||
--mindrecord_dir=$MINDRECORD_DIR \
|
||||
--image_dir=$IMAGE_DIR \
|
||||
--epoch_size=$EPOCH_SIZE \
|
||||
--anno_path=$ANNO_PATH > log.txt 2>&1 &
|
||||
|
||||
if [ $# == 6 ]
|
||||
then
|
||||
taskset -c $cmdopt python ../train.py \
|
||||
--distribute=1 \
|
||||
--lr=0.005 \
|
||||
--device_num=$RANK_SIZE \
|
||||
--device_id=$DEVICE_ID \
|
||||
--mindrecord_dir=$MINDRECORD_DIR \
|
||||
--image_dir=$IMAGE_DIR \
|
||||
--epoch_size=$EPOCH_SIZE \
|
||||
--anno_path=$ANNO_PATH > log.txt 2>&1 &
|
||||
fi
|
||||
|
||||
if [ $# == 8 ]
|
||||
then
|
||||
taskset -c $cmdopt python ../train.py \
|
||||
--distribute=1 \
|
||||
--lr=0.005 \
|
||||
--device_num=$RANK_SIZE \
|
||||
--device_id=$DEVICE_ID \
|
||||
--mindrecord_dir=$MINDRECORD_DIR \
|
||||
--image_dir=$IMAGE_DIR \
|
||||
--epoch_size=$EPOCH_SIZE \
|
||||
--pre_trained=$PRE_TRAINED \
|
||||
--pre_trained_epoch_size=$PRE_TRAINED_EPOCH_SIZE \
|
||||
--anno_path=$ANNO_PATH > log.txt 2>&1 &
|
||||
fi
|
||||
|
||||
cd ../
|
||||
done
|
||||
|
|
|
@ -14,10 +14,25 @@
|
|||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
echo "=============================================================================================================="
|
||||
echo "========================================================================================================================================="
|
||||
echo "Please run the scipt as: "
|
||||
echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH"
|
||||
echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt"
|
||||
echo "=============================================================================================================="
|
||||
echo "sh run_standalone_train.sh DEVICE_ID EPOCH_SIZE MINDRECORD_DIR IMAGE_DIR ANNO_PATH PRE_TRAINED PRE_TRAINED_EPOCH_SIZE"
|
||||
echo "for example: sh run_standalone_train.sh 0 50 ./Mindrecord_train ./dataset ./dataset/train.txt /opt/yolov3-50.ckpt(optional) 30(optional)"
|
||||
echo "========================================================================================================================================="
|
||||
|
||||
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
|
||||
if [ $# != 5 ] && [ $# != 7 ]
|
||||
then
|
||||
echo "Usage: sh run_standalone_train.sh [DEVICE_ID] [EPOCH_SIZE] [MINDRECORD_DIR] [IMAGE_DIR] [ANNO_PATH] \
|
||||
[PRE_TRAINED](optional) [PRE_TRAINED_EPOCH_SIZE](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ $# == 5 ]
|
||||
then
|
||||
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5
|
||||
fi
|
||||
|
||||
if [ $# == 7 ]
|
||||
then
|
||||
python train.py --device_id=$1 --epoch_size=$2 --mindrecord_dir=$3 --image_dir=$4 --anno_path=$5 --pre_trained=$6 --pre_trained_epoch_size=$7
|
||||
fi
|
||||
|
|
|
@ -71,6 +71,7 @@ def main():
|
|||
parser.add_argument("--epoch_size", type=int, default=10, help="Epoch size, default is 10")
|
||||
parser.add_argument("--batch_size", type=int, default=32, help="Batch size, default is 32.")
|
||||
parser.add_argument("--pre_trained", type=str, default=None, help="Pretrained checkpoint file path")
|
||||
parser.add_argument("--pre_trained_epoch_size", type=int, default=0, help="Pretrained epoch size")
|
||||
parser.add_argument("--save_checkpoint_epochs", type=int, default=5, help="Save checkpoint epochs, default is 5.")
|
||||
parser.add_argument("--loss_scale", type=int, default=1024, help="Loss scale, default is 1024.")
|
||||
parser.add_argument("--mindrecord_dir", type=str, default="./Mindrecord_train",
|
||||
|
@ -133,15 +134,20 @@ def main():
|
|||
ckpt_config = CheckpointConfig(save_checkpoint_steps=dataset_size * args_opt.save_checkpoint_epochs)
|
||||
ckpoint_cb = ModelCheckpoint(prefix="yolov3", directory=None, config=ckpt_config)
|
||||
|
||||
lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=0, global_step=args_opt.epoch_size * dataset_size,
|
||||
if args_opt.pre_trained:
|
||||
if args_opt.pre_trained_epoch_size <= 0:
|
||||
raise KeyError("pre_trained_epoch_size must be greater than 0.")
|
||||
param_dict = load_checkpoint(args_opt.pre_trained)
|
||||
load_param_into_net(net, param_dict)
|
||||
total_epoch_size = 60
|
||||
if args_opt.distribute:
|
||||
total_epoch_size = 160
|
||||
lr = Tensor(get_lr(learning_rate=args_opt.lr, start_step=args_opt.pre_trained_epoch_size * dataset_size,
|
||||
global_step=total_epoch_size * dataset_size,
|
||||
decay_step=1000, decay_rate=0.95, steps=True))
|
||||
opt = nn.Adam(filter(lambda x: x.requires_grad, net.get_parameters()), lr, loss_scale=loss_scale)
|
||||
net = TrainingWrapper(net, opt, loss_scale)
|
||||
|
||||
if args_opt.pre_trained:
|
||||
param_dict = load_checkpoint(args_opt.pre_trained)
|
||||
load_param_into_net(net, param_dict)
|
||||
|
||||
callback = [TimeMonitor(data_size=dataset_size), LossMonitor(), ckpoint_cb]
|
||||
|
||||
model = Model(net)
|
||||
|
|
Loading…
Reference in New Issue