diff --git a/example/resnet101_imagenet2012/README.md b/example/resnet101_imagenet2012/README.md
index cd2401f7fd3..6578b09f0ec 100644
--- a/example/resnet101_imagenet2012/README.md
+++ b/example/resnet101_imagenet2012/README.md
@@ -46,6 +46,7 @@ Parameters for both training and evaluating can be set in config.py.
 "momentum": 0.9,                  # momentum optimizer
 "weight_decay": 1e-4,             # weight decay
 "epoch_size": 120,                # epoch sizes for training
+"pretrain_epoch_size": 0,         # epochs already trained in the pretrained checkpoint
 "buffer_size": 1000,              # number of queue size in data preprocessing
 "image_height": 224,              # image height
 "image_width": 224,               # image width
@@ -68,10 +69,10 @@ Parameters for both training and evaluating can be set in config.py.
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)
 
 # standalone training
-sh run_standalone_train.sh [DATASET_PATH]
+sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)
 ```
 
 #### Launch
 
@@ -79,9 +80,15 @@ sh run_standalone_train.sh [DATASET_PATH]
 ```bash
 # distributed training example(8p)
 sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
-
+
+# If you want to load a pretrained ckpt file:
+sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./ckpt/pretrained.ckpt
+
 # standalone training example(1p)
 sh run_standalone_train.sh dataset/ilsvrc
+
+# If you want to load a pretrained ckpt file:
+sh run_standalone_train.sh dataset/ilsvrc ./ckpt/pretrained.ckpt
 ```
 
 > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
diff --git a/example/resnet101_imagenet2012/config.py b/example/resnet101_imagenet2012/config.py
index 0b9f16b504e..5f07014ad35 100755
--- a/example/resnet101_imagenet2012/config.py
+++ b/example/resnet101_imagenet2012/config.py
@@ -24,6 +24,7 @@ config = ed({
     "momentum": 0.9,
     "weight_decay": 1e-4,
     "epoch_size": 120,
+    "pretrain_epoch_size": 0,
     "buffer_size": 1000,
     "image_height": 224,
     "image_width": 224,
diff --git a/example/resnet101_imagenet2012/lr_generator.py b/example/resnet101_imagenet2012/lr_generator.py
index 88cb85cc5b3..2392e7a7bf8 100755
--- a/example/resnet101_imagenet2012/lr_generator.py
+++ b/example/resnet101_imagenet2012/lr_generator.py
@@ -21,7 +21,7 @@ def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
     lr = float(init_lr) + lr_inc * current_step
     return lr
 
-def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
+def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0):
     """
     generate learning rate array with cosine
 
@@ -30,6 +30,7 @@
         steps_per_epoch(int): steps size of one epoch
         warmup_epochs(int): number of warmup epochs
         max_epoch(int): total epochs of training
+        global_step(int): the current start index of the lr array
     Returns:
         np.array, learning rate array
     """
@@ -49,4 +50,7 @@
         decayed = linear_decay * cosine_decay + 0.00001
         lr = base_lr * decayed
         lr_each_step.append(lr)
-    return np.array(lr_each_step).astype(np.float32)
+
+    lr_each_step = np.array(lr_each_step).astype(np.float32)
+    learning_rate = lr_each_step[global_step:]
+    return learning_rate
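The `global_step` parameter is the heart of this change: the function still builds the full cosine schedule, then slices off the steps a pretrained run has already consumed, so a resumed job continues the curve instead of re-running warmup. Below is a minimal numpy sketch of that slicing behavior; it is simplified (the repo's version also folds in a linear decay term and a small floor), and the values used are illustrative, not taken from config.py:

```python
import numpy as np

def cosine_lr_sketch(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0):
    """Simplified stand-in for warmup_cosine_annealing_lr (sketch, not the repo's code)."""
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            # linear warmup from 0 up to the base lr
            lr_each_step.append(lr * i / warmup_steps)
        else:
            # cosine decay over the remaining steps
            cosine_decay = 0.5 * (1 + np.cos(np.pi * (i - warmup_steps) / (total_steps - warmup_steps)))
            lr_each_step.append(lr * cosine_decay)
    lr_each_step = np.array(lr_each_step).astype(np.float32)
    # the new behavior: drop the steps already covered by the pretrained checkpoint
    return lr_each_step[global_step:]

# Resuming after 30 epochs picks up exactly where the full schedule left off.
full = cosine_lr_sketch(0.1, steps_per_epoch=100, warmup_epochs=5)
resumed = cosine_lr_sketch(0.1, steps_per_epoch=100, warmup_epochs=5, global_step=30 * 100)
assert np.array_equal(full[3000:], resumed)
```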
diff --git a/example/resnet101_imagenet2012/run_distribute_train.sh b/example/resnet101_imagenet2012/run_distribute_train.sh
index ecdcd66859d..8f8021202d4 100755
--- a/example/resnet101_imagenet2012/run_distribute_train.sh
+++ b/example/resnet101_imagenet2012/run_distribute_train.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 2 ]
+if [ $# != 2 ] && [ $# != 3 ]
 then
-    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
+    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)"
     exit 1
 fi
 
@@ -31,6 +31,11 @@ PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
 echo $PATH1
 echo $PATH2
+if [ $# == 3 ]
+then
+    PATH3=$(get_real_path $3)
+    echo $PATH3
+fi
 
 if [ ! -f $PATH1 ]
 then
@@ -44,6 +49,12 @@ then
     exit 1
 fi
 
+if [ $# == 3 ] && [ ! -f $PATH3 ]
+then
+    echo "error: PRETRAINED_PATH=$PATH3 is not a file"
+    exit 1
+fi
+
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
@@ -61,6 +72,15 @@ do
     cd ./train_parallel$i || exit
     echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
-    python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
+    if [ $# == 2 ]
+    then
+        python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
+    fi
+
+    if [ $# == 3 ]
+    then
+        python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
+    fi
+
     cd ..
 done
diff --git a/example/resnet101_imagenet2012/run_standalone_train.sh b/example/resnet101_imagenet2012/run_standalone_train.sh
index dde018b8eb2..7db8b5d7bcc 100755
--- a/example/resnet101_imagenet2012/run_standalone_train.sh
+++ b/example/resnet101_imagenet2012/run_standalone_train.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 1 ]
+if [ $# != 1 ] && [ $# != 2 ]
 then
-    echo "Usage: sh run_standalone_train.sh [DATASET_PATH]"
+    echo "Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)"
     exit 1
 fi
 
@@ -29,12 +29,23 @@ get_real_path(){
 }
 PATH1=$(get_real_path $1)
 echo $PATH1
+if [ $# == 2 ]
+then
+    PATH2=$(get_real_path $2)
+    echo $PATH2
+fi
 
 if [ ! -d $PATH1 ]
 then
     echo "error: DATASET_PATH=$PATH1 is not a directory"
     exit 1
-fi
+fi
+
+if [ $# == 2 ] && [ ! -f $PATH2 ]
+then
+    echo "error: PRETRAINED_PATH=$PATH2 is not a file"
+    exit 1
+fi
 
 ulimit -u unlimited
 export DEVICE_NUM=1
@@ -52,5 +63,13 @@ cp *.sh ./train
 cd ./train || exit
 echo "start training for device $DEVICE_ID"
 env > env.log
-python train.py --do_train=True --dataset_path=$PATH1 &> log &
+if [ $# == 1 ]
+then
+    python train.py --do_train=True --dataset_path=$PATH1 &> log &
+fi
+
+if [ $# == 2 ]
+then
+    python train.py --do_train=True --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
+fi
 cd ..
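Both scripts handle the optional trailing argument by duplicating the `python train.py` invocation, one branch per argument count. A hedged alternative sketch follows (the script name `demo.sh` and the single-argument shape are hypothetical, not code from this patch): collecting the optional flag in a variable keeps a single invocation.

```bash
#!/bin/bash
# Hypothetical condensed variant of the pattern above, not code from the repo.
if [ $# != 1 ] && [ $# != 2 ]
then
    echo "Usage: sh demo.sh [DATASET_PATH] [PRETRAINED_PATH](optional)"
    exit 1
fi

PRETRAINED_ARG=""
if [ $# == 2 ]
then
    # forward the checkpoint flag only when the caller supplied a second path
    PRETRAINED_ARG="--pre_trained=$2"
fi

python train.py --do_train=True --dataset_path=$1 $PRETRAINED_ARG &> log &
```

The duplicated-branch form used in the patch is more verbose, but it keeps every launched command literal and greppable, which is a defensible choice for example scripts.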
diff --git a/example/resnet101_imagenet2012/train.py b/example/resnet101_imagenet2012/train.py
index 1401a340005..756cf2cdd86 100755
--- a/example/resnet101_imagenet2012/train.py
+++ b/example/resnet101_imagenet2012/train.py
@@ -44,6 +44,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.')
 parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
 parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
 parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
+parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
 args_opt = parser.parse_args()
 
 device_id = int(os.getenv('DEVICE_ID'))
@@ -77,9 +78,13 @@ if __name__ == '__main__':
                              repeat_num=epoch_size, batch_size=config.batch_size)
     step_size = dataset.get_dataset_size()
     loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+    if args_opt.pre_trained:
+        param_dict = load_checkpoint(args_opt.pre_trained)
+        load_param_into_net(net, param_dict)
 
     # learning rate strategy with cosine
-    lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size))
+    lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size,
+                                           config.pretrain_epoch_size * step_size))
     opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                    config.weight_decay, config.loss_scale)
     model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False,
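Two things are easy to miss when reading the train.py hunk. First, it calls `load_checkpoint` and `load_param_into_net` without adding an import, so the file presumably already imports them; if not, the import in the sketch below is needed. Second, `--pre_trained` only restores weights: the schedule offset comes from `config.pretrain_epoch_size`, which the user must set by hand to the number of epochs already trained. A minimal sketch of the resume path, assuming the MindSpore version this example targets (the checkpoint path and `class_num` value are hypothetical; `resnet101` stands in for the builder train.py already uses):

```python
# Import path as used by MindSpore examples of this vintage; verify train.py has it.
from mindspore.train.serialization import load_checkpoint, load_param_into_net

net = resnet101(class_num=1001)  # built the same way train.py builds it

# load_checkpoint reads the .ckpt into a dict of parameter name -> Parameter;
# load_param_into_net copies those tensors into the live network in place.
param_dict = load_checkpoint("./ckpt/pretrained.ckpt")  # hypothetical path
load_param_into_net(net, param_dict)

# With config.pretrain_epoch_size = 90 and epoch_size = 120, the lr array passed
# to Momentum covers only the remaining 30 * step_size steps of the cosine curve.
```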