From 7cdc647afa3f4905548369e80319b9d72af8e246 Mon Sep 17 00:00:00 2001 From: gengdongjie Date: Wed, 21 Oct 2020 19:47:34 +0800 Subject: [PATCH] fix issues --- model_zoo/official/cv/maskrcnn/README.md | 84 ++++++++++--------- .../maskrcnn/scripts/run_distribute_train.sh | 1 + model_zoo/official/cv/maskrcnn/src/config.py | 1 - model_zoo/official/cv/resnet/README.md | 75 ++++++++++++++--- 4 files changed, 109 insertions(+), 52 deletions(-) diff --git a/model_zoo/official/cv/maskrcnn/README.md b/model_zoo/official/cv/maskrcnn/README.md index 959a4dd5693..28feb1fa093 100644 --- a/model_zoo/official/cv/maskrcnn/README.md +++ b/model_zoo/official/cv/maskrcnn/README.md @@ -19,8 +19,8 @@ - [Evaluation Result](#evaluation-result) - [Model Description](#model-description) - [Performance](#performance) - - [Training Performance](#training-performance) - [Evaluation Performance](#evaluation-performance) + - [Inference Performance](#inference-performance) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) @@ -280,7 +280,7 @@ Usage: sh run_standalone_train.sh [PRETRAINED_MODEL] "save_checkpoint": True, # whether save checkpoint or not "save_checkpoint_epochs": 1, # save checkpoint interval "keep_checkpoint_max": 12, # max number of saved checkpoint -"save_checkpoint_path": "./checkpoint", # path of checkpoint +"save_checkpoint_path": "./", # path of checkpoint "mindrecord_dir": "/home/maskrcnn/MindRecord_COCO2017_Train", # path of mindrecord "coco_root": "/home/maskrcnn/", # path of coco root dateset @@ -336,13 +336,13 @@ Training result will be stored in the example path, whose folder name begins wit ``` # distribute training result(8p) -epoch: 1 step: 7393 ,rpn_loss: 0.10626, rcnn_loss: 0.81592, rpn_cls_loss: 0.05862, rpn_reg_loss: 0.04761, rcnn_cls_loss: 0.32642, rcnn_reg_loss: 0.15503, rcnn_mask_loss: 0.33447, total_loss: 0.92218 -epoch: 2 step: 7393 ,rpn_loss: 0.00911, rcnn_loss: 0.34082, rpn_cls_loss: 0.00341, 
rpn_reg_loss: 0.00571, rcnn_cls_loss: 0.07440, rcnn_reg_loss: 0.05872, rcnn_mask_loss: 0.20764, total_loss: 0.34993 -epoch: 3 step: 7393 ,rpn_loss: 0.02087, rcnn_loss: 0.98633, rpn_cls_loss: 0.00665, rpn_reg_loss: 0.01422, rcnn_cls_loss: 0.35913, rcnn_reg_loss: 0.21375, rcnn_mask_loss: 0.41382, total_loss: 1.00720 +epoch: 1 step: 7393 ,rpn_loss: 0.05716, rcnn_loss: 0.81152, rpn_cls_loss: 0.04828, rpn_reg_loss: 0.00889, rcnn_cls_loss: 0.28784, rcnn_reg_loss: 0.17590, rcnn_mask_loss: 0.34790, total_loss: 0.86868 +epoch: 2 step: 7393 ,rpn_loss: 0.00434, rcnn_loss: 0.36572, rpn_cls_loss: 0.00339, rpn_reg_loss: 0.00095, rcnn_cls_loss: 0.08240, rcnn_reg_loss: 0.05554, rcnn_mask_loss: 0.22778, total_loss: 0.37006 +epoch: 3 step: 7393 ,rpn_loss: 0.00996, rcnn_loss: 0.83789, rpn_cls_loss: 0.00701, rpn_reg_loss: 0.00294, rcnn_cls_loss: 0.39478, rcnn_reg_loss: 0.14917, rcnn_mask_loss: 0.29370, total_loss: 0.84785 ... -epoch: 10 step: 7393 ,rpn_loss: 0.02122, rcnn_loss: 0.55176, rpn_cls_loss: 0.00620, rpn_reg_loss: 0.01503, rcnn_cls_loss: 0.12708, rcnn_reg_loss: 0.10254, rcnn_mask_loss: 0.32227, total_loss: 0.57298 -epoch: 11 step: 7393 ,rpn_loss: 0.03772, rcnn_loss: 0.60791, rpn_cls_loss: 0.03058, rpn_reg_loss: 0.00713, rcnn_cls_loss: 0.23987, rcnn_reg_loss: 0.11743, rcnn_mask_loss: 0.25049, total_loss: 0.64563 -epoch: 12 step: 7393 ,rpn_loss: 0.06482, rcnn_loss: 0.47681, rpn_cls_loss: 0.04770, rpn_reg_loss: 0.01709, rcnn_cls_loss: 0.16492, rcnn_reg_loss: 0.04990, rcnn_mask_loss: 0.26196, total_loss: 0.54163 +epoch: 10 step: 7393 ,rpn_loss: 0.00667, rcnn_loss: 0.65625, rpn_cls_loss: 0.00536, rpn_reg_loss: 0.00131, rcnn_cls_loss: 0.17590, rcnn_reg_loss: 0.16199, rcnn_mask_loss: 0.31812, total_loss: 0.66292 +epoch: 11 step: 7393 ,rpn_loss: 0.02003, rcnn_loss: 0.52051, rpn_cls_loss: 0.01761, rpn_reg_loss: 0.00241, rcnn_cls_loss: 0.16028, rcnn_reg_loss: 0.08411, rcnn_mask_loss: 0.27588, total_loss: 0.54054 +epoch: 12 step: 7393 ,rpn_loss: 0.00547, rcnn_loss: 0.39258, 
rpn_cls_loss: 0.00285, rpn_reg_loss: 0.00262, rcnn_cls_loss: 0.08002, rcnn_reg_loss: 0.04990, rcnn_mask_loss: 0.26245, total_loss: 0.39804 ``` ## [Evaluation Process](#contents) @@ -364,39 +364,39 @@ Inference result will be stored in the example path, whose folder name is "eval" ``` Evaluate annotation type *bbox* Accumulating evaluation results... - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.376 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.598 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.405 - Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.239 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.414 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.475 + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.378 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.602 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.407 + Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.242 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.417 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.480 Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.311 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.500 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.528 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.371 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.572 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.653 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.497 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.524 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.363 + Average Recall (AR) @[ IoU=0.50:0.95 
| area=medium | maxDets=100 ] = 0.567 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.647 Evaluate annotation type *segm* Accumulating evaluation results... - Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.326 - Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.553 - Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.344 + Average Precision (AP) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.335 + Average Precision (AP) @[ IoU=0.50 | area= all | maxDets=100 ] = 0.557 + Average Precision (AP) @[ IoU=0.75 | area= all | maxDets=100 ] = 0.351 Average Precision (AP) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.169 - Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.356 - Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.462 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.278 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.426 - Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.445 - Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.294 - Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.484 - Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.558 + Average Precision (AP) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.365 + Average Precision (AP) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.480 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 1 ] = 0.284 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets= 10 ] = 0.433 + Average Recall (AR) @[ IoU=0.50:0.95 | area= all | maxDets=100 ] = 0.451 + Average Recall (AR) @[ IoU=0.50:0.95 | area= small | maxDets=100 ] = 0.285 + Average Recall (AR) @[ IoU=0.50:0.95 | area=medium | maxDets=100 ] = 0.490 + Average Recall (AR) @[ IoU=0.50:0.95 | area= large | maxDets=100 ] = 0.586 ``` # Model Description ## Performance -### Training 
Performance +### Evaluation Performance | Parameters | MaskRCNN | | -------------------------- | ----------------------------------------------------------- | @@ -407,14 +407,18 @@ Accumulating evaluation results... | Dataset | COCO2017 | | Training Parameters | epoch=12, batch_size = 2 | | Optimizer | SGD | -| Loss Function | Softmax Cross Entropy ,Sigmoid Cross Entropy,SmoothL1Loss | -| Speed | 1pc: 250 ms/step; 8pcs: 260 ms/step | -| Total time | 1pc: 52 hours; 8pcs: 6.6 hours | -| Parameters (M) | 280 | -| Scripts | https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/maskrcnn | +| Loss Function | Softmax Cross Entropy, Sigmoid Cross Entropy, SmoothL1Loss | +| Output | Probability | +| Loss | 0.39804 | +| Speed | 1pc: 193 ms/step; 8pcs: 207 ms/step | +| Total time | 1pc: 46 hours; 8pcs: 5.38 hours | +| Parameters (M) | 84.8 | +| Checkpoint for Fine tuning | 85M (.ckpt file) | +| Model for inference | 571M (.air file) | +| Scripts | [maskrcnn script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/maskrcnn) | -### Evaluation Performance +### Inference Performance | Parameters | MaskRCNN | | ------------------- | --------------------------- | @@ -425,12 +429,12 @@ Accumulating evaluation results... | Dataset | COCO2017 | | batch_size | 2 | | outputs | mAP | -| Accuracy | IoU=0.50:0.95 32.4% | -| Model for inference | 254M (.ckpt file) | +| Accuracy | IoU=0.50:0.95 (BoundingBox 37.0%, Mask 33.5%) | +| Model for inference | 170M (.ckpt file) | # [Description of Random Situation](#contents) In dataset.py, we set the seed inside “create_dataset" function. We also use random seed in train.py for weight initialization. # [ModelZoo Homepage](#contents) -Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo). \ No newline at end of file +Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo). 
diff --git a/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh b/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh index ab4a172f6e7..26a8ad7a4fe 100644 --- a/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh +++ b/model_zoo/official/cv/maskrcnn/scripts/run_distribute_train.sh @@ -46,6 +46,7 @@ exit 1 fi ulimit -u unlimited +export HCCL_CONNECT_TIMEOUT=600 export DEVICE_NUM=8 export RANK_SIZE=8 export RANK_TABLE_FILE=$PATH1 diff --git a/model_zoo/official/cv/maskrcnn/src/config.py b/model_zoo/official/cv/maskrcnn/src/config.py index 27a04c92b0a..3ff93790d38 100644 --- a/model_zoo/official/cv/maskrcnn/src/config.py +++ b/model_zoo/official/cv/maskrcnn/src/config.py @@ -126,7 +126,6 @@ config = ed({ "warmup_step": 500, "warmup_mode": "linear", "warmup_ratio": 1/3.0, - "sgd_step": [8, 11], "sgd_momentum": 0.9, # train diff --git a/model_zoo/official/cv/resnet/README.md b/model_zoo/official/cv/resnet/README.md index e7b1c8f0bcf..88a52ee7bcc 100644 --- a/model_zoo/official/cv/resnet/README.md +++ b/model_zoo/official/cv/resnet/README.md @@ -15,6 +15,7 @@ - [Model Description](#model-description) - [Performance](#performance) - [Evaluation Performance](#evaluation-performance) + - [Inference Performance](#inference-performance) - [Description of Random Situation](#description-of-random-situation) - [ModelZoo Homepage](#modelzoo-homepage) @@ -136,9 +137,11 @@ sh run_eval_gpu.sh [resnet50|resnet101] [cifar10|imagenet2012] [DATASET_PATH] [C ├── src ├── config.py # parameter configuration ├── dataset.py # data preprocessing - ├── CrossEntropySmooth.py # loss definition for ImageNet2012 dataset + ├── CrossEntropySmooth.py # loss definition for ImageNet2012 dataset ├── lr_generator.py # generate learning rate for each step └── resnet.py # resnet backbone, including resnet50 and resnet101 and se-resnet50 + ├── export.py # export model for inference + ├── mindspore_hub_conf.py # mindspore hub interface ├── eval.py # eval net └── train.py # 
train net ``` @@ -184,7 +187,7 @@ Parameters for both training and evaluation can be set in config.py. "save_checkpoint_path": "./", # path to save checkpoint relative to the executed path "warmup_epochs": 0, # number of warmup epoch "lr_decay_mode": "Linear", # decay mode for generating learning rate -"use_label_smooth": True, # label smooth +"use_label_smooth": True, # label smooth "label_smooth_factor": 0.1, # label smooth factor "lr_init": 0, # initial learning rate "lr_max": 0.8, # maximum learning rate @@ -207,7 +210,7 @@ Parameters for both training and evaluation can be set in config.py. "save_checkpoint_path": "./", # path to save checkpoint relative to the executed path "warmup_epochs": 0, # number of warmup epoch "lr_decay_mode": "cosine" # decay mode for generating learning rate -"use_label_smooth": True, # label_smooth +"use_label_smooth": True, # label_smooth "label_smooth_factor": 0.1, # label_smooth_factor "lr": 0.1 # base learning rate ``` @@ -229,7 +232,7 @@ Parameters for both training and evaluation can be set in config.py. "save_checkpoint_path": "./", # path to save checkpoint relative to the executed path "warmup_epochs": 3, # number of warmup epoch "lr_decay_mode": "cosine" # decay mode for generating learning rate -"use_label_smooth": True, # label_smooth +"use_label_smooth": True, # label_smooth "label_smooth_factor": 0.1, # label_smooth_factor "lr_init": 0.0, # initial learning rate "lr_max": 0.3, # maximum learning rate @@ -313,18 +316,13 @@ epoch: 5 step: 5004, loss is 3.1978393 - Training ResNet101 with ImageNet2012 dataset ``` -# distribute training result(8p) +# distribute training result(8 pcs) epoch: 1 step: 5004, loss is 4.805483 epoch: 2 step: 5004, loss is 3.2121816 epoch: 3 step: 5004, loss is 3.429647 epoch: 4 step: 5004, loss is 3.3667371 epoch: 5 step: 5004, loss is 3.1718972 ... 
-epoch: 67 step: 5004, loss is 2.2768745 -epoch: 68 step: 5004, loss is 1.7223864 -epoch: 69 step: 5004, loss is 2.0665488 -epoch: 70 step: 5004, loss is 1.8717369 -... ``` - Training SE-ResNet50 with ImageNet2012 dataset @@ -457,7 +455,7 @@ result: {'top_5_accuracy': 0.9342589628681178, 'top_1_accuracy': 0.7680657810499 | -------------------------- | ------------------------------------------------------------------------ | | Model Version | SE-ResNet50 | | Resource | Ascend 910,CPU 2.60GHz 192cores,Memory 755G | -| uploaded Date | 08/16/2020 (month/day/year) ; | +| uploaded Date | 08/16/2020 (month/day/year) | | MindSpore Version | 0.7.0-alpha | | Dataset | ImageNet2012 | | Training Parameters | epoch=24, steps per epoch=5004, batch_size = 32 | @@ -471,6 +469,61 @@ result: {'top_5_accuracy': 0.9342589628681178, 'top_1_accuracy': 0.7680657810499 | Checkpoint for Fine tuning | 215.9M (.ckpt file) | | Scripts | [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/resnet) | +### Inference Performance + +#### ResNet50 on CIFAR-10 +| Parameters | Ascend | GPU | +| ------------------- | --------------------------- | --------------------------- | +| Model Version | ResNet50-v1.5 | ResNet50-v1.5 | +| Resource | Ascend 910 | GPU | +| Uploaded Date | 04/01/2020 (month/day/year) | 08/01/2020 (month/day/year) | +| MindSpore Version | 0.1.0-alpha | 0.6.0-alpha | +| Dataset | CIFAR-10 | CIFAR-10 | +| batch_size | 32 | 32 | +| outputs | probability | probability | +| Accuracy | 91.44% | 91.37% | +| Model for inference | 91M (.air file) | | + +#### ResNet50 on ImageNet2012 +| Parameters | Ascend | GPU | +| ------------------- | --------------------------- | --------------------------- | +| Model Version | ResNet50-v1.5 | ResNet50-v1.5 | +| Resource | Ascend 910 | GPU | +| Uploaded Date | 04/01/2020 (month/day/year) | 08/01/2020 (month/day/year) | +| MindSpore Version | 0.1.0-alpha | 0.6.0-alpha | +| Dataset | ImageNet2012 | ImageNet2012 | +| batch_size | 
256 | 32 | +| outputs | probability | probability | +| Accuracy | 76.70% | 76.74% | +| Model for inference | 98M (.air file) | | + +#### ResNet101 on ImageNet2012 +| Parameters | Ascend | GPU | +| ------------------- | --------------------------- | --------------------------- | +| Model Version | ResNet101 | ResNet101 | +| Resource | Ascend 910 | GPU | +| Uploaded Date | 04/01/2020 (month/day/year) | 08/01/2020 (month/day/year) | +| MindSpore Version | 0.1.0-alpha | 0.6.0-alpha | +| Dataset | ImageNet2012 | ImageNet2012 | +| batch_size | 32 | 32 | +| outputs | probability | probability | +| Accuracy | 78.53% | 78.64% | +| Model for inference | 171M (.air file) | | + +#### SE-ResNet50 on ImageNet2012 +| Parameters | Ascend | +| ------------------- | --------------------------- | +| Model Version | SE-ResNet50 | +| Resource | Ascend 910 | +| Uploaded Date | 08/16/2020 (month/day/year) | +| MindSpore Version | 0.7.0-alpha | +| Dataset | ImageNet2012 | +| batch_size | 32 | +| outputs | probability | +| Accuracy | 76.80% | +| Model for inference | 109M (.air file) | + + # [Description of Random Situation](#contents) In dataset.py, we set the seed inside “create_dataset" function. We also use random seed in train.py.