!6299 mobilenetv2 debug for load ckpt

Merge pull request !6299 from yepei6/r0.7_mobilenet_debug
2020-09-18 11:28:16 +08:00 · 2020-09-18 11:28:16 +08:00 · ab997f9e37
parent 5027130939 93c4d2929c
commit ab997f9e37
4 changed files with 26 additions and 19 deletions
--- a/model_zoo/official/cv/mobilenetv2/eval.py
+++ b/model_zoo/official/cv/mobilenetv2/eval.py
@ -46,6 +46,10 @@ if __name__ == '__main__':

    dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=False, config=config)
    step_size = dataset.get_dataset_size()
+    if step_size == 0:
+        raise ValueError("The step_size of dataset is zero. Check if the images count of train dataset is more \
+            than batch_size in config.py")
+
    net.set_train(False)

    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean')
--- a/model_zoo/official/cv/mobilenetv2/src/dataset.py
+++ b/model_zoo/official/cv/mobilenetv2/src/dataset.py
@ -16,7 +16,6 @@
 create train or eval dataset.
 """
 import os
-from tqdm import tqdm
 import numpy as np

 from mindspore import Tensor
@ -109,19 +108,20 @@ def extract_features(net, dataset_path, config):
                             config=config,
                             repeat_num=1)
    step_size = dataset.get_dataset_size()
-    pbar = tqdm(list(dataset.create_dict_iterator()))
+    if step_size == 0:
+        raise ValueError("The step_size of dataset is zero. Check if the images count of train dataset is more \
+            than batch_size in config.py")
+
    model = Model(net)
-    i = 0
-    for data in pbar:
+    for i, data in enumerate(dataset.create_dict_iterator()):
        features_path = os.path.join(features_folder, f"feature_{i}.npy")
        label_path = os.path.join(features_folder, f"label_{i}.npy")
-        if not (os.path.exists(features_path) and os.path.exists(label_path)):
+        if not os.path.exists(features_path) or not os.path.exists(label_path):
            image = data["image"]
            label = data["label"]
            features = model.predict(Tensor(image))
            np.save(features_path, features.asnumpy())
            np.save(label_path, label)
-        pbar.set_description("Process dataset batch: %d" % (i + 1))
-        i += 1
+        print(f"Complete the batch {i}/{step_size}")

    return step_size
--- a/model_zoo/official/cv/mobilenetv2/src/mobilenetV2.py
+++ b/model_zoo/official/cv/mobilenetv2/src/mobilenetV2.py
@ -330,8 +330,12 @@ class MobileNetV2(nn.Cell):
    MobileNetV2 architecture.

    Args:
-        backbone(nn.Cell):
-        head(nn.Cell):
+        class_num (int): number of classes.
+        width_mult (int): Channels multiplier for round to 8/16 and others. Default is 1.
+        has_dropout (bool): Whether dropout is used. Default is False.
+        inverted_residual_setting (list): Inverted residual settings. Default is None
+        round_nearest (int): Channel counts are rounded to a multiple of this value. Default is 8.
+
    Returns:
        Tensor, output tensor.

@ -355,14 +359,11 @@ class MobileNetV2(nn.Cell):

 class MobileNetV2Combine(nn.Cell):
    """
-    MobileNetV2 architecture.
+    MobileNetV2Combine architecture.

    Args:
-        class_num (Cell): number of classes.
-        width_mult (int): Channels multiplier for round to 8/16 and others. Default is 1.
-        has_dropout (bool): Is dropout used. Default is false
-        inverted_residual_setting (list): Inverted residual settings. Default is None
-        round_nearest (list): Channel round to . Default is 8
+        backbone (Cell): The feature extraction layers.
+        head (Cell): The fully connected layer.
    Returns:
        Tensor, output tensor.

@ -371,7 +372,7 @@ class MobileNetV2Combine(nn.Cell):
    """

    def __init__(self, backbone, head):
-        super(MobileNetV2Combine, self).__init__()
+        super(MobileNetV2Combine, self).__init__(auto_prefix=False)
        self.backbone = backbone
        self.head = head

--- a/model_zoo/official/cv/mobilenetv2/train.py
+++ b/model_zoo/official/cv/mobilenetv2/train.py
@ -62,6 +62,9 @@ if __name__ == '__main__':
            raise ValueError("Currently, CPU only support \"incremental_learn\", not \"fine_tune\" or \"train\".")
        dataset = create_dataset(dataset_path=args_opt.dataset_path, do_train=True, config=config)
        step_size = dataset.get_dataset_size()
+        if step_size == 0:
+            raise ValueError("The step_size of dataset is zero. Check if the images count of train dataset is more \
+                than batch_size in config.py")

    # Currently, only Ascend support switch precision.
    switch_precision(net, mstype.float16, config)
@ -108,9 +111,8 @@ if __name__ == '__main__':
                losses.append(network(feature, label).asnumpy())
            epoch_mseconds = (time.time()-epoch_start) * 1000
            per_step_mseconds = epoch_mseconds / step_size
-            print("\r epoch[{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\
-            .format(epoch + 1, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses))), \
-                end="")
+            print("epoch[{}], iter[{}] cost: {:5.3f}, per step time: {:5.3f}, avg loss: {:5.3f}"\
+            .format(epoch + 1, step_size, epoch_mseconds, per_step_mseconds, np.mean(np.array(losses))))
            if (epoch + 1) % config.save_checkpoint_epochs == 0:
                _exec_save_checkpoint(network, os.path.join(config.save_checkpoint_path, \
                    f"mobilenetv2_head_{epoch+1}.ckpt"))