forked from mindspore-Ecosystem/mindspore
!12226 fix cnn & efficientnet bugs
From: @pandoublefeng Reviewed-by: @liangchenghui,@c_34 Signed-off-by: @c_34
This commit is contained in:
commit
e489b67a3a
|
@ -15,7 +15,7 @@
|
|||
# ============================================================================
|
||||
if [ $# != 2 ] && [ $# != 3 ]
|
||||
then
|
||||
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
|
||||
echo "Usage: sh run_distribute_train.sh [RANK_TABLE_FILE] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
|
@ -76,13 +76,13 @@ do
|
|||
|
||||
if [ $# == 2 ]
|
||||
then
|
||||
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 &> log &
|
||||
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 > train.log 2>&1 &
|
||||
fi
|
||||
|
||||
if [ $# == 3 ]
|
||||
then
|
||||
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 &> log &
|
||||
python train.py --run_distribute=True --device_num=$DEVICE_NUM --dataset_path=$PATH2 --pre_trained=$PATH3 > train.log 2>&1 &
|
||||
fi
|
||||
|
||||
cd ..
|
||||
done
|
||||
done
|
||||
|
|
|
@ -57,6 +57,6 @@ cd ./eval || exit
|
|||
echo "start evaluation for device $DEVICE_ID"
|
||||
env > env.log
|
||||
|
||||
python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 #&> log &
|
||||
python eval.py --dataset_path=$PATH1 --checkpoint_path=$PATH2 > eval.log 2>&1 &
|
||||
|
||||
cd ..
|
||||
cd ..
|
||||
|
|
|
@ -66,7 +66,7 @@ fi
|
|||
|
||||
if [ $# == 2 ]
|
||||
then
|
||||
python train.py --dataset_path=$PATH1 --pre_trained=$PATH2 &> log &
|
||||
python train.py --dataset_path=$PATH1 --pre_trained=$PATH2 > train.log 2>&1 &
|
||||
fi
|
||||
|
||||
cd ..
|
||||
cd ..
|
||||
|
|
|
@ -41,6 +41,7 @@ parser.add_argument('--device_num', type=int, default=1, help='Device num.')
|
|||
parser.add_argument('--dataset_path', type=str, default=None, help='Dataset path')
|
||||
parser.add_argument('--device_target', type=str, default='Ascend', help='Device target')
|
||||
parser.add_argument('--pre_trained', type=str, default=None, help='Pretrained checkpoint path')
|
||||
parser.add_argument('--is_save_on_master', type=int, default=1, help='save ckpt on master or all rank')
|
||||
|
||||
args_opt = parser.parse_args()
|
||||
|
||||
|
@ -70,6 +71,13 @@ if __name__ == '__main__':
|
|||
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
|
||||
init()
|
||||
|
||||
args_opt.rank_save_ckpt_flag = 0
|
||||
if args_opt.is_save_on_master:
|
||||
if rank_id == 0:
|
||||
args_opt.rank_save_ckpt_flag = 1
|
||||
else:
|
||||
args_opt.rank_save_ckpt_flag = 1
|
||||
|
||||
# create dataset
|
||||
dataset_name = config.dataset_name
|
||||
dataset = create_dataset_train(args_opt.dataset_path + "/" + dataset_name +
|
||||
|
@ -100,10 +108,11 @@ if __name__ == '__main__':
|
|||
loss_cb = LossMonitor()
|
||||
cb = [time_cb, loss_cb]
|
||||
if config.save_checkpoint:
|
||||
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
|
||||
keep_checkpoint_max=config.keep_checkpoint_max)
|
||||
ckpt_cb = ModelCheckpoint(prefix="cnn_direction_model", directory=ckpt_save_dir, config=config_ck)
|
||||
cb += [ckpt_cb]
|
||||
if args_opt.rank_save_ckpt_flag == 1:
|
||||
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_steps,
|
||||
keep_checkpoint_max=config.keep_checkpoint_max)
|
||||
ckpt_cb = ModelCheckpoint(prefix="cnn_direction_model", directory=ckpt_save_dir, config=config_ck)
|
||||
cb += [ckpt_cb]
|
||||
|
||||
# train model
|
||||
model.train(config.epoch_size, dataset, callbacks=cb, dataset_sink_mode=False)
|
||||
|
|
|
@ -18,7 +18,6 @@
|
|||
|
||||
# [EfficientNet-B0 Description](#contents)
|
||||
|
||||
|
||||
[Paper](https://arxiv.org/abs/1905.11946): Mingxing Tan, Quoc V. Le. EfficientNet: Rethinking Model Scaling for Convolutional Neural Networks. 2019.
|
||||
|
||||
# [Model architecture](#contents)
|
||||
|
@ -27,27 +26,25 @@ The overall network architecture of EfficientNet-B0 is show below:
|
|||
|
||||
[Link](https://arxiv.org/abs/1905.11946)
|
||||
|
||||
|
||||
# [Dataset](#contents)
|
||||
|
||||
Dataset used: [imagenet](http://www.image-net.org/)
|
||||
|
||||
- Dataset size: ~125G, 1.2W colorful images in 1000 classes
|
||||
- Train: 120G, 1.2W images
|
||||
- Test: 5G, 50000 images
|
||||
- Train: 120G, 1.2W images
|
||||
- Test: 5G, 50000 images
|
||||
- Data format: RGB images.
|
||||
- Note: Data will be processed in src/dataset.py
|
||||
|
||||
- Note: Data will be processed in src/dataset.py
|
||||
|
||||
# [Environment Requirements](#contents)
|
||||
|
||||
- Hardware GPU
|
||||
- Prepare hardware environment with GPU processor.
|
||||
- Prepare hardware environment with GPU processor.
|
||||
- Framework
|
||||
- [MindSpore](https://www.mindspore.cn/install/en)
|
||||
- [MindSpore](https://www.mindspore.cn/install/en)
|
||||
- For more information, please check the resources below:
|
||||
- [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
|
||||
- [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
|
||||
- [MindSpore Tutorials](https://www.mindspore.cn/tutorial/training/en/master/index.html)
|
||||
- [MindSpore Python API](https://www.mindspore.cn/doc/api_python/en/master/index.html)
|
||||
|
||||
# [Script description](#contents)
|
||||
|
||||
|
@ -77,7 +74,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
|
|||
|
||||
Parameters for both training and evaluating can be set in config.py.
|
||||
|
||||
```
|
||||
```python
|
||||
'random_seed': 1, # fix random seed
|
||||
'model': 'efficientnet_b0', # model name
|
||||
'drop': 0.2, # dropout rate
|
||||
|
@ -106,17 +103,17 @@ Parameters for both training and evaluating can be set in config.py.
|
|||
|
||||
## [Training Process](#contents)
|
||||
|
||||
#### Usage
|
||||
### Usage
|
||||
|
||||
```
|
||||
```python
|
||||
GPU:
|
||||
# distribute training example(8p)
|
||||
sh run_distribute_train_for_gpu.sh
|
||||
sh run_distribute_train_for_gpu.sh
|
||||
# standalone training
|
||||
sh run_standalone_train_for_gpu.sh DEVICE_ID DATA_DIR
|
||||
```
|
||||
|
||||
#### Launch
|
||||
### Launch
|
||||
|
||||
```bash
|
||||
# distributed training example(8p) for GPU
|
||||
|
@ -133,7 +130,7 @@ You can find checkpoint file together with result in log.
|
|||
|
||||
### Usage
|
||||
|
||||
```
|
||||
```bash
|
||||
# Evaluation
|
||||
sh run_eval_for_gpu.sh DATA_DIR DEVICE_ID PATH_CHECKPOINT
|
||||
```
|
||||
|
@ -148,9 +145,9 @@ sh run_eval_for_gpu.sh /dataset/eval ./checkpoint/efficientnet_b0-600_1251.ckpt
|
|||
|
||||
#### Result
|
||||
|
||||
Evaluation result will be stored in the scripts path. Under this, you can find result like the followings in log.
|
||||
Evaluation result will be stored in the scripts path. Under this, you can find result like the following in log.
|
||||
|
||||
```
|
||||
```python
|
||||
acc=76.96%(TOP1)
|
||||
```
|
||||
|
||||
|
@ -186,7 +183,6 @@ acc=76.96%(TOP1)
|
|||
| outputs | probability |
|
||||
| Accuracy | acc=76.96%(TOP1) |
|
||||
|
||||
|
||||
# [ModelZoo Homepage](#contents)
|
||||
|
||||
|
||||
Please check the official [homepage](https://gitee.com/mindspore/mindspore/tree/master/model_zoo).
|
||||
|
|
|
@ -31,7 +31,7 @@ class RandAugment:
|
|||
self.hparams = hparams
|
||||
|
||||
def __call__(self, imgs, labels, batchInfo):
|
||||
# assert the imgs objetc are pil_images
|
||||
# assert the imgs object are pil_images
|
||||
ret_imgs = []
|
||||
ret_labels = []
|
||||
py_to_pil_op = P.ToPIL()
|
||||
|
|
Loading…
Reference in New Issue