diff --git a/example/resnet101_imagenet2012/README.md b/example/resnet101_imagenet2012/README.md
index cd2401f7fd3..6578b09f0ec 100644
--- a/example/resnet101_imagenet2012/README.md
+++ b/example/resnet101_imagenet2012/README.md
@@ -46,6 +46,7 @@ Parameters for both training and evaluating can be set in config.py.
 "momentum": 0.9,                  # momentum optimizer
 "weight_decay": 1e-4,             # weight decay
 "epoch_size": 120,                # epoch sizes for training
+"pretrain_epoch_size": 0,         # epochs already trained in the pretrained checkpoint
 "buffer_size": 1000,              # number of queue size in data preprocessing
 "image_height": 224,              # image height
 "image_width": 224,               # image width
@@ -68,10 +69,10 @@ Parameters for both training and evaluating can be set in config.py.
 ```
 # distributed training
-sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)
 
 # standalone training
-sh run_standalone_train.sh [DATASET_PATH]
+sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)
 ```
 
 #### Launch
 
@@ -79,9 +80,15 @@ sh run_standalone_train.sh [DATASET_PATH]
 ```bash
 # distributed training example(8p)
 sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
-
+
+# If you want to load a pretrained ckpt file:
+sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./ckpt/pretrained.ckpt
+
 # standalone training example(1p)
 sh run_standalone_train.sh dataset/ilsvrc
+
+# If you want to load a pretrained ckpt file:
+sh run_standalone_train.sh dataset/ilsvrc ./ckpt/pretrained.ckpt
 ```
 
 > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
diff --git a/example/resnet101_imagenet2012/config.py b/example/resnet101_imagenet2012/config.py
index 0b9f16b504e..5f07014ad35 100755
--- a/example/resnet101_imagenet2012/config.py
+++ b/example/resnet101_imagenet2012/config.py
@@ -24,6 +24,7 @@ config = ed({
     "momentum": 0.9,
     "weight_decay": 1e-4,
     "epoch_size": 120,
+    "pretrain_epoch_size": 0,
     "buffer_size": 1000,
     "image_height": 224,
     "image_width": 224,
diff --git a/example/resnet101_imagenet2012/lr_generator.py b/example/resnet101_imagenet2012/lr_generator.py
index 88cb85cc5b3..2392e7a7bf8 100755
--- a/example/resnet101_imagenet2012/lr_generator.py
+++ b/example/resnet101_imagenet2012/lr_generator.py
@@ -21,7 +21,7 @@ def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
     lr = float(init_lr) + lr_inc * current_step
     return lr
 
-def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch):
+def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0):
     """
     generate learning rate array with cosine
 
@@ -30,6 +30,7 @@
         steps_per_epoch(int): steps size of one epoch
         warmup_epochs(int): number of warmup epochs
         max_epoch(int): total epochs of training
+        global_step(int): the current start index of the lr array
     Returns:
         np.array, learning rate array
     """
@@ -49,4 +50,7 @@
         decayed = linear_decay * cosine_decay + 0.00001
         lr = base_lr * decayed
         lr_each_step.append(lr)
-    return np.array(lr_each_step).astype(np.float32)
+
+    lr_each_step = np.array(lr_each_step).astype(np.float32)
+    learning_rate = lr_each_step[global_step:]
+    return learning_rate
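The `global_step` parameter is the heart of this change: the function still builds the full cosine schedule, then slices off the steps a pretrained run has already consumed, so a resumed job continues the curve instead of re-running warmup. Below is a minimal numpy sketch of that slicing behavior; it is simplified (the repo's version also folds in a linear decay term and a small floor), and the values used are illustrative, not taken from config.py:

```python
import numpy as np

def cosine_lr_sketch(lr, steps_per_epoch, warmup_epochs, max_epoch=120, global_step=0):
    """Simplified stand-in for warmup_cosine_annealing_lr (sketch, not the repo's code)."""
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)
    lr_each_step = []
    for i in range(total_steps):
        if i < warmup_steps:
            # linear warmup from 0 up to the base lr
            lr_each_step.append(lr * i / warmup_steps)
        else:
            # cosine decay over the remaining steps
            cosine_decay = 0.5 * (1 + np.cos(np.pi * (i - warmup_steps) / (total_steps - warmup_steps)))
            lr_each_step.append(lr * cosine_decay)
    lr_each_step = np.array(lr_each_step).astype(np.float32)
    # the new behavior: drop the steps already covered by the pretrained checkpoint
    return lr_each_step[global_step:]

# Resuming after 30 epochs picks up exactly where the full schedule left off.
full = cosine_lr_sketch(0.1, steps_per_epoch=100, warmup_epochs=5)
resumed = cosine_lr_sketch(0.1, steps_per_epoch=100, warmup_epochs=5, global_step=30 * 100)
assert np.array_equal(full[3000:], resumed)
```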
diff --git a/example/resnet101_imagenet2012/run_distribute_train.sh b/example/resnet101_imagenet2012/run_distribute_train.sh
index ecdcd66859d..8f8021202d4 100755
--- a/example/resnet101_imagenet2012/run_distribute_train.sh
+++ b/example/resnet101_imagenet2012/run_distribute_train.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 2 ]
+if [ $# != 2 ] && [ $# != 3 ]
 then
-    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
+    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_PATH](optional)"
     exit 1
 fi
 
@@ -31,6 +31,11 @@ PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
 echo $PATH1
 echo $PATH2
+if [ $# == 3 ]
+then
+    PATH3=$(get_real_path $3)
+    echo $PATH3
+fi
 
 if [ ! -f $PATH1 ]
 then
@@ -44,6 +49,12 @@ then
     exit 1
 fi
 
+if [ $# == 3 ] && [ ! -f $PATH3 ]
+then
+    echo "error: PRETRAINED_PATH=$PATH3 is not a file"
+    exit 1
+fi
+
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
@@ -61,6 +72,15 @@ do
     cd ./train_parallel$i || exit
     echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
-    python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
+    if [ $# == 2 ]
+    then
+        python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
+    fi
+
+    if [ $# == 3 ]
+    then
+        python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
+    fi
+
     cd ..
 done
diff --git a/example/resnet101_imagenet2012/run_standalone_train.sh b/example/resnet101_imagenet2012/run_standalone_train.sh
index dde018b8eb2..7db8b5d7bcc 100755
--- a/example/resnet101_imagenet2012/run_standalone_train.sh
+++ b/example/resnet101_imagenet2012/run_standalone_train.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 1 ]
+if [ $# != 1 ] && [ $# != 2 ]
 then
-    echo "Usage: sh run_standalone_train.sh [DATASET_PATH]"
+    echo "Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_PATH](optional)"
     exit 1
 fi
 
@@ -29,12 +29,23 @@ get_real_path(){
 }
 PATH1=$(get_real_path $1)
 echo $PATH1
+if [ $# == 2 ]
+then
+    PATH2=$(get_real_path $2)
+    echo $PATH2
+fi
 
 if [ ! -d $PATH1 ]
 then
     echo "error: DATASET_PATH=$PATH1 is not a directory"
     exit 1
-fi
+fi
+
+if [ $# == 2 ] && [ ! -f $PATH2 ]
+then
+    echo "error: PRETRAINED_PATH=$PATH2 is not a file"
+    exit 1
+fi
 
 ulimit -u unlimited
 export DEVICE_NUM=1
@@ -52,5 +63,13 @@ cp *.sh ./train
 cd ./train || exit
 echo "start training for device $DEVICE_ID"
 env > env.log
-python train.py --do_train=True --dataset_path=$PATH1 &> log &
+if [ $# == 1 ]
+then
+    python train.py --do_train=True --dataset_path=$PATH1 &> log &
+fi
+
+if [ $# == 2 ]
+then
+    python train.py --do_train=True --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
+fi
 cd ..
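Both scripts handle the optional trailing argument by duplicating the `python train.py` invocation, one branch per argument count. A hedged alternative sketch follows (the script name `demo.sh` and the single-argument shape are hypothetical, not code from this patch): collecting the optional flag in a variable keeps a single invocation.

```bash
#!/bin/bash
# Hypothetical condensed variant of the pattern above, not code from the repo.
if [ $# != 1 ] && [ $# != 2 ]
then
    echo "Usage: sh demo.sh [DATASET_PATH] [PRETRAINED_PATH](optional)"
    exit 1
fi

PRETRAINED_ARG=""
if [ $# == 2 ]
then
    # forward the checkpoint flag only when the caller supplied a second path
    PRETRAINED_ARG="--pre_trained=$2"
fi

python train.py --do_train=True --dataset_path=$1 $PRETRAINED_ARG &> log &
```

The duplicated-branch form used in the patch is more verbose, but it keeps every launched command literal and greppable, which is a defensible choice for example scripts.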
diff --git a/example/resnet101_imagenet2012/train.py b/example/resnet101_imagenet2012/train.py
index 1401a340005..756cf2cdd86 100755
--- a/example/resnet101_imagenet2012/train.py
+++ b/example/resnet101_imagenet2012/train.py
@@ -44,6 +44,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.')
 parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
 parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
 parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
+parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
 args_opt = parser.parse_args()
 
 device_id = int(os.getenv('DEVICE_ID'))
@@ -77,9 +78,13 @@ if __name__ == '__main__':
                              repeat_num=epoch_size, batch_size=config.batch_size)
     step_size = dataset.get_dataset_size()
     loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
+    if args_opt.pre_trained:
+        param_dict = load_checkpoint(args_opt.pre_trained)
+        load_param_into_net(net, param_dict)
 
     # learning rate strategy with cosine
-    lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size))
+    lr = Tensor(warmup_cosine_annealing_lr(config.lr, step_size, config.warmup_epochs, config.epoch_size,
+                                           config.pretrain_epoch_size * step_size))
     opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                    config.weight_decay, config.loss_scale)
     model = Model(net, loss_fn=loss, optimizer=opt, amp_level='O2', keep_batchnorm_fp32=False,
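Two things are easy to miss when reading the train.py hunk. First, it calls `load_checkpoint` and `load_param_into_net` without adding an import, so the file presumably already imports them; if not, the import in the sketch below is needed. Second, `--pre_trained` only restores weights: the schedule offset comes from `config.pretrain_epoch_size`, which the user must set by hand to the number of epochs already trained. A minimal sketch of the resume path, assuming the MindSpore version this example targets (the checkpoint path and `class_num` value are hypothetical; `resnet101` stands in for the builder train.py already uses):

```python
# Import path as used by MindSpore examples of this vintage; verify train.py has it.
from mindspore.train.serialization import load_checkpoint, load_param_into_net

net = resnet101(class_num=1001)  # built the same way train.py builds it

# load_checkpoint reads the .ckpt into a dict of parameter name -> Parameter;
# load_param_into_net copies those tensors into the live network in place.
param_dict = load_checkpoint("./ckpt/pretrained.ckpt")  # hypothetical path
load_param_into_net(net, param_dict)

# With config.pretrain_epoch_size = 90 and epoch_size = 120, the lr array passed
# to Momentum covers only the remaining 30 * step_size steps of the cosine curve.
```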