From d85102acfbaf9ff63ac1174791e53bf795998f38 Mon Sep 17 00:00:00 2001
From: gengdongjie <gengdongjie@huawei.com>
Date: Fri, 22 May 2020 23:41:59 +0800
Subject: [PATCH] add pretrained option to resnet50_imagenet

---
 example/resnet50_cifar10/config.py            |  2 +-
 example/resnet50_imagenet2012/README.md       | 12 +++++--
 example/resnet50_imagenet2012/config.py       |  1 +
 example/resnet50_imagenet2012/lr_generator.py |  7 ++--
 .../run_distribute_train.sh                   | 21 ++++++++++--
 .../run_standalone_train.sh                   | 21 ++++++++++--
 example/resnet50_imagenet2012/train.py        | 33 ++++++++++++-------
 7 files changed, 71 insertions(+), 26 deletions(-)

diff --git a/example/resnet50_cifar10/config.py b/example/resnet50_cifar10/config.py
index 8b5e5354253..c148e4329c7 100755
--- a/example/resnet50_cifar10/config.py
+++ b/example/resnet50_cifar10/config.py
@@ -28,7 +28,7 @@ config = ed({
     "image_height": 224,
     "image_width": 224,
     "save_checkpoint": True,
-    "save_checkpoint_steps": 195,
+    "save_checkpoint_steps": 1950,
     "keep_checkpoint_max": 10,
     "save_checkpoint_path": "./",
     "warmup_epochs": 5,
diff --git a/example/resnet50_imagenet2012/README.md b/example/resnet50_imagenet2012/README.md
index ba1742cdffd..05b39daaca0 100644
--- a/example/resnet50_imagenet2012/README.md
+++ b/example/resnet50_imagenet2012/README.md
@@ -45,6 +45,7 @@ Parameters for both training and inference can be set in config.py.
 "momentum": 0.9,                  # momentum optimizer
 "weight_decay": 1e-4,             # weight decay 
 "epoch_size": 90,                 # only valid for taining, which is always 1 for inference 
+"pretrained_epoch_size": 1,       # epoch size that model has been trained before load pretrained checkpoint
 "buffer_size": 1000,              # number of queue size in data preprocessing
 "image_height": 224,              # image height
 "image_width": 224,               # image width
@@ -68,10 +69,11 @@ Parameters for both training and inference can be set in config.py.
 
 ```
 # distributed training
-Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]
+Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
 
 # standalone training
-Usage: sh run_standalone_train.sh [DATASET_PATH]
+Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
+
 ```
 
 
@@ -81,8 +83,14 @@ Usage: sh run_standalone_train.sh [DATASET_PATH]
 # distributed training example(8 pcs)
 sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc
 
+# If you want to load pretrained ckpt file
+sh run_distribute_train.sh rank_table_8p.json dataset/ilsvrc ./pretrained.ckpt
+
 # standalone training example(1 pcs)
 sh run_standalone_train.sh dataset/ilsvrc
+
+# If you want to load pretrained ckpt file
+sh run_standalone_train.sh dataset/ilsvrc ./pretrained.ckpt
 ```
 
 > About rank_table.json, you can refer to the [distributed training tutorial](https://www.mindspore.cn/tutorial/en/master/advanced_use/distributed_training.html).
diff --git a/example/resnet50_imagenet2012/config.py b/example/resnet50_imagenet2012/config.py
index 4ba90743625..e33c2b6aa09 100755
--- a/example/resnet50_imagenet2012/config.py
+++ b/example/resnet50_imagenet2012/config.py
@@ -24,6 +24,7 @@ config = ed({
     "momentum": 0.9,
     "weight_decay": 1e-4,
     "epoch_size": 90,
+    "pretrained_epoch_size": 1,
     "buffer_size": 1000,
     "image_height": 224,
     "image_width": 224,
diff --git a/example/resnet50_imagenet2012/lr_generator.py b/example/resnet50_imagenet2012/lr_generator.py
index faf1302ae29..4a57be2f01e 100755
--- a/example/resnet50_imagenet2012/lr_generator.py
+++ b/example/resnet50_imagenet2012/lr_generator.py
@@ -17,12 +17,11 @@ import math
 import numpy as np
 
 
-def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
+def get_lr(lr_init, lr_end, lr_max, warmup_epochs, total_epochs, steps_per_epoch, lr_decay_mode):
     """
     generate learning rate array
 
     Args:
-       global_step(int): total steps of the training
        lr_init(float): init learning rate
        lr_end(float): end learning rate
        lr_max(float): max learning rate
@@ -83,8 +82,6 @@ def get_lr(global_step, lr_init, lr_end, lr_max, warmup_epochs, total_epochs, st
                 lr = lr_max - (lr_max - lr_end) * (i - warmup_steps) / (total_steps - warmup_steps)
             lr_each_step.append(lr)
 
-    current_step = global_step
-    lr_each_step = np.array(lr_each_step).astype(np.float32)
-    learning_rate = lr_each_step[current_step:]
+    learning_rate = np.array(lr_each_step).astype(np.float32)
 
     return learning_rate
diff --git a/example/resnet50_imagenet2012/run_distribute_train.sh b/example/resnet50_imagenet2012/run_distribute_train.sh
index 7a45269cd6b..235a48e9c8f 100755
--- a/example/resnet50_imagenet2012/run_distribute_train.sh
+++ b/example/resnet50_imagenet2012/run_distribute_train.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 2 ]
+if [ $# != 2 ] && [ $# != 3 ]
 then 
-    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH]"
+	echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
 
@@ -30,6 +30,10 @@ get_real_path(){
 
 PATH1=$(get_real_path $1)
 PATH2=$(get_real_path $2)
+if [ $# == 3 ]
+then
+  PATH3=$(get_real_path $3)
+fi
 
 if [ ! -f "$PATH1" ]
 then 
@@ -43,6 +47,12 @@ then
 exit 1
 fi 
 
+if [ ! -f "$PATH3" ]
+then 
+    echo "error: PRETRAINED_CKPT_PATH=$PATH3 is not a file"
+exit 1
+fi 
+
 ulimit -u unlimited
 export DEVICE_NUM=8
 export RANK_SIZE=8
@@ -60,6 +70,11 @@ do
     cd ./train_parallel$i || exit
     echo "start training for rank $RANK_ID, device $DEVICE_ID"
     env > env.log
-    python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
+    if [ $# == 2 ]
+    then	    
+        python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
+    else
+	python train.py --do_train=True --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
+    fi
     cd ..
 done
diff --git a/example/resnet50_imagenet2012/run_standalone_train.sh b/example/resnet50_imagenet2012/run_standalone_train.sh
index cb08cde6c94..c4dc95b7ebf 100755
--- a/example/resnet50_imagenet2012/run_standalone_train.sh
+++ b/example/resnet50_imagenet2012/run_standalone_train.sh
@@ -14,9 +14,9 @@
 # limitations under the License.
 # ============================================================================
 
-if [ $# != 1 ]
+if [ $# != 1 ] && [ $# != 2 ]
 then 
-    echo "Usage: sh run_standalone_train.sh [DATASET_PATH]"
+    echo "Usage: sh run_standalone_train.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
 exit 1
 fi
 
@@ -29,6 +29,10 @@ get_real_path(){
 }
 
 PATH1=$(get_real_path $1)
+if [ $# == 2 ]
+then
+  PATH2=$(get_real_path $2)
+fi
 
 if [ ! -d "$PATH1" ]
 then 
@@ -36,6 +40,12 @@ then
 exit 1
 fi 
 
+if [ ! -f "$PATH2" ]
+then 
+    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
+exit 1
+fi
+
 ulimit -u unlimited
 export DEVICE_NUM=1
 export DEVICE_ID=0
@@ -51,5 +61,10 @@ cp *.sh ./train
 cd ./train || exit
 echo "start training for device $DEVICE_ID"
 env > env.log
-python train.py --do_train=True --dataset_path=$PATH1 &> log &
+if [ $# == 1 ]
+then	    
+    python train.py --do_train=True --dataset_path=$PATH1 &> log &
+else
+    python train.py --do_train=True --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
+fi
 cd ..
diff --git a/example/resnet50_imagenet2012/train.py b/example/resnet50_imagenet2012/train.py
index 9b3fc7573c6..42f19b4d640 100755
--- a/example/resnet50_imagenet2012/train.py
+++ b/example/resnet50_imagenet2012/train.py
@@ -28,6 +28,7 @@ from mindspore.train.model import Model, ParallelMode
 
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
+from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.communication.management import init
 import mindspore.nn as nn
 import mindspore.common.initializer as weight_init
@@ -39,6 +40,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.')
 parser.add_argument('--do_train', type=bool, default=True, help='Do train or not.')
 parser.add_argument('--do_eval', type=bool, default=False, help='Do eval or not.')
 parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
+parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
 args_opt = parser.parse_args()
 
 device_id = int(os.getenv('DEVICE_ID'))
@@ -58,15 +60,20 @@ if __name__ == '__main__':
     net = resnet50(class_num=config.class_num)
 
     # weight init
-    for _, cell in net.cells_and_names():
-        if isinstance(cell, nn.Conv2d):
-            cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
-                                                                cell.weight.default_input.shape(),
-                                                                cell.weight.default_input.dtype()).to_tensor()
-        if isinstance(cell, nn.Dense):
-            cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
-                                                                cell.weight.default_input.shape(),
-                                                                cell.weight.default_input.dtype()).to_tensor()
+    if args_opt.pre_trained:
+        param_dict = load_checkpoint(args_opt.pre_trained)
+        load_param_into_net(net, param_dict)
+        epoch_size = config.epoch_size - config.pretrained_epoch_size
+    else:
+        for _, cell in net.cells_and_names():
+            if isinstance(cell, nn.Conv2d):
+                cell.weight.default_input = weight_init.initializer(weight_init.XavierUniform(),
+                                                                    cell.weight.default_input.shape(),
+                                                                    cell.weight.default_input.dtype()).to_tensor()
+            if isinstance(cell, nn.Dense):
+                cell.weight.default_input = weight_init.initializer(weight_init.TruncatedNormal(),
+                                                                    cell.weight.default_input.shape(),
+                                                                    cell.weight.default_input.dtype()).to_tensor()
     if not config.use_label_smooth:
         config.label_smooth_factor = 0.0
 
@@ -78,9 +85,11 @@ if __name__ == '__main__':
         step_size = dataset.get_dataset_size()
 
         loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
-        lr = Tensor(get_lr(global_step=0, lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max,
-                           warmup_epochs=config.warmup_epochs, total_epochs=epoch_size, steps_per_epoch=step_size,
-                           lr_decay_mode='cosine'))
+        lr = get_lr(lr_init=config.lr_init, lr_end=0.0, lr_max=config.lr_max, warmup_epochs=config.warmup_epochs,
+                    total_epochs=config.epoch_size, steps_per_epoch=step_size, lr_decay_mode='cosine')
+        if args_opt.pre_trained:
+            lr = lr[config.pretrained_epoch_size * step_size:]
+        lr = Tensor(lr)
 
         opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), lr, config.momentum,
                        config.weight_decay, config.loss_scale)