!20491 add gpu scripts to tinydarknet
Merge pull request !20491 from ZeyangGAO/code_docs_tdngpu
This commit is contained in:
commit
a5b29dc4c9
|
@ -60,8 +60,8 @@ Dataset used can refer to [paper](<https://ieeexplore.ieee.org/abstract/document
|
|||
|
||||
# [Environment Requirements](#contents)
|
||||
|
||||
- Hardware(Ascend/CPU)
|
||||
- Prepare hardware environment with Ascend/CPU processor.
|
||||
- Hardware(Ascend/CPU/GPU)
|
||||
- Prepare a hardware environment with an Ascend, GPU, or CPU processor.
|
||||
- Framework
|
||||
- [MindSpore](https://www.mindspore.cn/install/en)
|
||||
- For more information, please check the resources below:
|
||||
|
@ -93,6 +93,35 @@ After installing MindSpore via the official website, you can start training and
|
|||
|
||||
<https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools.>
|
||||
|
||||
- Running on GPU with the default GPU parameters
|
||||
|
||||
```python
|
||||
# GPU standalone training example
|
||||
python train.py \
|
||||
--config_path=./config/imagenet_config_gpu.yaml \
|
||||
--dataset_name=imagenet --train_data_dir=../dataset/imagenet_original/train --device_target=GPU
|
||||
OR
|
||||
cd scripts
|
||||
bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10 | imagenet]
|
||||
|
||||
# GPU distribute training example
|
||||
export RANK_SIZE=8
|
||||
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
|
||||
python train.py \
|
||||
--config_path=./config/imagenet_config_gpu.yaml \
|
||||
--dataset_name=imagenet \
|
||||
--train_data_dir=../dataset/imagenet_original/train \
|
||||
--device_target=GPU
|
||||
OR
|
||||
bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10 | imagenet]
|
||||
|
||||
# GPU evaluation example
|
||||
python eval.py --device_target=GPU --val_data_dir=../dataset/imagenet_original/val --dataset_name=imagenet --config_path=./config/imagenet_config_gpu.yaml \
|
||||
--checkpoint_path=$PATH2
|
||||
OR
|
||||
bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
|
||||
```
|
||||
|
||||
- Running on ModelArts
|
||||
|
||||
If you want to run in modelarts, please check the official documentation of [modelarts](https://support.huaweicloud.com/modelarts/), and you can start training as follows.
|
||||
|
@ -155,12 +184,20 @@ For more details, please refer the specify script.
|
|||
├── README.md // descriptions about Tiny-Darknet in English
|
||||
├── README_CN.md // descriptions about Tiny-Darknet in Chinese
|
||||
├── ascend310_infer // application for 310 inference
|
||||
├── config
|
||||
├── imagenet_config.yaml // imagenet parameter configuration
|
||||
├── imagenet_config_gpu.yaml // imagenet parameter configuration for GPU
|
||||
├── cifar10_config.yaml // cifar10 parameter configuration
|
||||
├── cifar10_config_gpu.yaml // cifar10 parameter configuration for GPU
|
||||
├── scripts
|
||||
├── run_standalone_train.sh // shell script for single on Ascend
|
||||
├── run_standalone_train_gpu.sh // shell script for single on GPU
|
||||
├── run_distribute_train.sh // shell script for distributed on Ascend
|
||||
├── run_distribute_train_gpu.sh // shell script for distributed on GPU
|
||||
├── run_train_cpu.sh // shell script for training on CPU
|
||||
├── run_eval.sh // shell script for evaluation on Ascend
|
||||
├── run_eval_cpu.sh // shell script for evaluation on CPU
|
||||
├── run_eval_gpu.sh // shell script for evaluation on GPU
|
||||
├── run_infer_310.sh // shell script for inference on Ascend310
|
||||
├── src
|
||||
├── lr_scheduler //learning rate scheduler
|
||||
|
@ -179,8 +216,6 @@ For more details, please refer the specify script.
|
|||
├── train.py // training script
|
||||
├── eval.py // evaluation script
|
||||
├── export.py // export checkpoint file into air/onnx
|
||||
├── imagenet_config.yaml // imagenet parameter configuration
|
||||
├── cifar10_config.yaml // cifar10 parameter configuration
|
||||
├── mindspore_hub_conf.py // hub config
|
||||
├── postprocess.py // postprocess script
|
||||
|
||||
|
@ -252,6 +287,29 @@ For more configuration details, please refer the script `imagenet_config.yaml`.
|
|||
The model checkpoint file will be saved in the current folder.
|
||||
<!-- The model checkpoint will be saved in the current directory. -->
|
||||
|
||||
- running on GPU:
|
||||
|
||||
```python
|
||||
cd scripts
|
||||
bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10|imagenet]
|
||||
```
|
||||
|
||||
The command above will run in the background; you can view the results in the file train.log.
|
||||
|
||||
After training, you'll get some checkpoint files under the script folder by default. The loss values will be displayed as follows:
|
||||
<!-- After training, you'll get some checkpoint files under the script folder by default. The loss value will be achieved as follows: -->
|
||||
|
||||
```python
|
||||
# grep "loss is " train.log
|
||||
epoch: 498 step: 1251, loss is 2.7798953
|
||||
Epoch time: 130690.544, per step time: 104.469
|
||||
epoch: 499 step: 1251, loss is 2.9261637
|
||||
Epoch time: 130511.081, per step time: 104.325
|
||||
epoch: 500 step: 1251, loss is 2.69412
|
||||
Epoch time: 127067.548, per step time: 101.573
|
||||
...
|
||||
```
|
||||
|
||||
- running on CPU
|
||||
|
||||
```python
|
||||
|
@ -279,6 +337,25 @@ For more configuration details, please refer the script `imagenet_config.yaml`.
|
|||
...
|
||||
```
|
||||
|
||||
- running on GPU:
|
||||
|
||||
```python
|
||||
bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10|imagenet]
|
||||
```
|
||||
|
||||
The above shell script will run distributed training in the background. You can view the results in the file `distribute_train_gpu/log.txt`. The loss values will be displayed as follows:
|
||||
|
||||
```python
|
||||
# grep "loss is " distribute_train_gpu/log.txt
|
||||
epoch: 498 step: 1251, loss is 2.7825122
|
||||
epoch time: 200066.210 ms, per step time: 159.925 ms
|
||||
epoch: 499 step: 1251, loss is 2.799798
|
||||
epoch time: 199098.258 ms, per step time: 159.151 ms
|
||||
epoch: 500 step: 1251, loss is 2.8718748
|
||||
epoch time: 197784.661 ms, per step time: 158.101 ms
|
||||
...
|
||||
```
|
||||
|
||||
## [Evaluation Process](#contents)
|
||||
|
||||
### [Evaluation](#contents)
|
||||
|
@ -307,6 +384,28 @@ For more configuration details, please refer the script `imagenet_config.yaml`.
|
|||
accuracy: {'top_1_accuracy': 0.5871979166666667, 'top_5_accuracy': 0.8175280448717949}
|
||||
```
|
||||
|
||||
- evaluation on Imagenet dataset when running on GPU:
|
||||
|
||||
Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "/username/tinydarknet/train_tinydarknet.ckpt".
|
||||
|
||||
```python
|
||||
bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
|
||||
```
|
||||
|
||||
The above command will run in the background. You can view the results in the file "eval.log". The accuracy on the test dataset will be as follows:
|
||||
|
||||
```python
|
||||
# grep "accuracy: " eval.log
|
||||
accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
|
||||
```
|
||||
|
||||
Note that for evaluation after distributed training, please set the checkpoint_path to be the last saved checkpoint file. The accuracy of the test dataset will be as follows:
|
||||
|
||||
```python
|
||||
# grep "accuracy: " eval.log
|
||||
accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
|
||||
```
|
||||
|
||||
- evaluation on cifar-10 dataset when running on CPU:
|
||||
|
||||
Before running the command below, please check the checkpoint path used for evaluation. Please set the checkpoint path to be the absolute full path, e.g., "/username/tinydarknet/train_tinydarknet.ckpt".
|
||||
|
@ -389,34 +488,33 @@ Inference result is saved in current path, you can find result like this in acc.
|
|||
|
||||
### [Training Performance](#contents)
|
||||
|
||||
| Parameters | Ascend |
|
||||
| -------------------------- | ----------------------------------------------------------- |
|
||||
| Model Version | V1 |
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 56cores; Memory 314G; OS Euler2.8 |
|
||||
| Uploaded Date | 2020/12/22 |
|
||||
| MindSpore Version | 1.1.0 |
|
||||
| Dataset | 1200k images |
|
||||
| Training Parameters | epoch=500, steps=1251, batch_size=128, lr=0.1 |
|
||||
| Optimizer | Momentum |
|
||||
| Loss Function | Softmax Cross Entropy |
|
||||
| Speed | 8 pc: 104 ms/step |
|
||||
| Total Time | 8 pc: 17.8 hours |
|
||||
| Parameters(M) | 4.0M |
|
||||
| Scripts | [Tiny-Darknet Scripts](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) |
|
||||
| Parameters | Ascend | GPU |
|
||||
| -------------------------- | ------------------------------------------------------------| ----------------------------------------------------|
|
||||
| Model Version | V1 | V1 |
|
||||
| Resource | Ascend 910; CPU 2.60GHz, 56cores; Memory 314G; OS Euler2.8 | PCIE V100-32G |
|
||||
| Uploaded Date | 2020/12/22 | 2021/07/15 |
|
||||
| MindSpore Version | 1.1.0 | 1.3.0 |
|
||||
| Dataset | 1200k images | 1200k images |
|
||||
| Training Parameters | epoch=500, steps=1251, batch_size=128, lr=0.1 | epoch=500, steps=1251, batch_size = 128, lr=0.005 |
|
||||
| Optimizer | Momentum | Momentum |
|
||||
| Loss Function | Softmax Cross Entropy | Softmax Cross Entropy |
|
||||
| Speed | 8pc: 104 ms/step | 8pc: 255 ms/step |
|
||||
| Parameters(M) | 4.0 | 4.0 |
|
||||
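| Scripts | [Tiny-Darknet scripts](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) | [Tiny-Darknet scripts](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) |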
|
||||
|
||||
### [Evaluation Performance](#contents)
|
||||
|
||||
| Parameters | Ascend |
|
||||
| ------------------- | --------------------------- |
|
||||
| Model Version | V1 |
|
||||
| Resource | Ascend 910; OS Euler2.8 |
|
||||
| Uploaded Date | 2020/12/22 |
|
||||
| MindSpore Version | 1.1.0 |
|
||||
| Dataset | 200k images |
|
||||
| batch_size | 128 |
|
||||
| Outputs | probability |
|
||||
| Accuracy | 8 pc Top-1: 58.7%; Top-5: 81.7% |
|
||||
| Model for inference | 11.6M (.ckpt file) |
|
||||
| Parameters | Ascend | GPU |
|
||||
| ------------------- | ----------------------------------| ----------------------------------|
|
||||
| Model Version | V1 | V1 |
|
||||
| Resource | Ascend 910; OS Euler2.8 | PCIE V100-32G |
|
||||
| Uploaded Date | 2020/12/22 | 2021/7/15 |
|
||||
| MindSpore Version | 1.1.0 | 1.3.0 |
|
||||
| Dataset | 200k images | 200k images |
|
||||
| batch_size | 128 | 128 |
|
||||
| Outputs | probability | probability |
|
||||
| Accuracy | 8pcs Top-1: 58.7%; Top-5: 81.7% | 8pcs Top-1: 58.9%; Top-5: 81.7% |
|
||||
| Model for inference | 11.6M (.ckpt file) | 10.06M (.ckpt file) |
|
||||
|
||||
### [Inference Performance](#contents)
|
||||
|
||||
|
|
|
@ -68,8 +68,8 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
|
|||
|
||||
# [环境要求](#目录)
|
||||
|
||||
- 硬件(Ascend/CPU)
|
||||
- 请准备具有Ascend/CPU处理器的硬件环境.
|
||||
- 硬件(Ascend/CPU/GPU)
|
||||
- 请准备具有Ascend/CPU处理器/GPU的硬件环境.
|
||||
- 框架
|
||||
- [MindSpore](https://www.mindspore.cn/install)
|
||||
- 更多的信息请访问以下链接:
|
||||
|
@ -101,6 +101,35 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
|
|||
|
||||
<https://gitee.com/mindspore/mindspore/tree/master/model_zoo/utils/hccl_tools.>
|
||||
|
||||
- 在GPU上使用GPU默认参数运行
|
||||
|
||||
```python
|
||||
# GPU单卡训练示例
|
||||
python train.py \
|
||||
--config_path=./config/imagenet_config_gpu.yaml \
|
||||
--dataset_name=imagenet --train_data_dir=../dataset/imagenet_original/train --device_target=GPU
|
||||
OR
|
||||
cd scripts
|
||||
bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10 | imagenet]
|
||||
|
||||
# GPU多卡训练示例
|
||||
export RANK_SIZE=8
|
||||
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
|
||||
python train.py \
|
||||
--config_path=./config/imagenet_config_gpu.yaml \
|
||||
--dataset_name=imagenet \
|
||||
--train_data_dir=../dataset/imagenet_original/train \
|
||||
--device_target=GPU
|
||||
OR
|
||||
bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10 | imagenet]
|
||||
|
||||
# GPU评估示例
|
||||
python eval.py --device_target=GPU --val_data_dir=../dataset/imagenet_original/val --dataset_name=imagenet --config_path=./config/imagenet_config_gpu.yaml \
|
||||
--checkpoint_path=$PATH2
|
||||
OR
|
||||
bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
|
||||
```
|
||||
|
||||
- 在ModelArts上运行
|
||||
如果你想在modelarts上运行,可以参考以下文档 [modelarts](https://support.huaweicloud.com/modelarts/)
|
||||
|
||||
|
@ -162,12 +191,20 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
|
|||
├── README.md // Tiny-Darknet英文说明
|
||||
├── README_CN.md // Tiny-Darknet中文说明
|
||||
├── ascend310_infer // 用于310推理
|
||||
├── config
|
||||
├── imagenet_config.yaml // imagenet参数配置
|
||||
├── imagenet_config_gpu.yaml // imagenet GPU参数配置
|
||||
├── cifar10_config.yaml // cifar10参数配置
|
||||
├── cifar10_config_gpu.yaml // cifar10 GPU参数配置
|
||||
├── scripts
|
||||
├── run_standalone_train.sh // Ascend单卡训练shell脚本
|
||||
├── run_standalone_train_gpu.sh // GPU单卡训练shell脚本
|
||||
├── run_distribute_train.sh // Ascend分布式训练shell脚本
|
||||
├── run_distribute_train_gpu.sh // GPU分布式训练shell脚本
|
||||
├── run_train_cpu.sh // CPU训练shell脚本
|
||||
├── run_eval.sh // Ascend评估shell脚本
|
||||
├── run_eval_cpu.sh // CPU评估shell脚本
|
||||
├── run_eval_gpu.sh // GPU评估shell脚本
|
||||
└── run_infer_310.sh // Ascend310推理shell脚本
|
||||
├── src
|
||||
├── lr_scheduler // 学习率策略
|
||||
|
@ -186,8 +223,6 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
|
|||
├── train.py // 训练脚本
|
||||
├── eval.py // 评估脚本
|
||||
├── export.py // 导出checkpoint文件
|
||||
├── imagenet_config.yaml // imagenet参数配置
|
||||
├── cifar10_config.yaml // cifar10参数配置
|
||||
├── mindspore_hub_conf.py // hub配置文件
|
||||
└── postprocess.py // 310推理后处理脚本
|
||||
|
||||
|
@ -259,6 +294,29 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
|
|||
模型checkpoint文件将会保存在当前文件夹下.
|
||||
<!-- The model checkpoint will be saved in the current directory. -->
|
||||
|
||||
- 在GPU资源上运行:
|
||||
|
||||
```python
|
||||
cd scripts
|
||||
bash run_standalone_train_gpu.sh [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10|imagenet]
|
||||
```
|
||||
|
||||
上述的命令将运行在后台中,可以通过 `train_single_gpu/train.log` 文件查看运行结果.
|
||||
|
||||
训练完成后,默认情况下,可在script文件夹下得到一些checkpoint文件. 训练的损失值将以如下的形式展示:
|
||||
<!-- After training, you'll get some checkpoint files under the script folder by default. The loss value will be achieved as follows: -->
|
||||
|
||||
```python
|
||||
# grep "loss is " train.log
|
||||
epoch: 498 step: 1251, loss is 2.7798953
|
||||
Epoch time: 130690.544, per step time: 104.469
|
||||
epoch: 499 step: 1251, loss is 2.9261637
|
||||
Epoch time: 130511.081, per step time: 104.325
|
||||
epoch: 500 step: 1251, loss is 2.69412
|
||||
Epoch time: 127067.548, per step time: 101.573
|
||||
...
|
||||
```
|
||||
|
||||
- 在CPU资源上运行:
|
||||
|
||||
```python
|
||||
|
@ -273,16 +331,35 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
|
|||
bash scripts/run_distribute_train.sh [RANK_TABLE_FILE]
|
||||
```
|
||||
|
||||
上述的脚本命令将在后台中进行分布式训练,可以通过`train_parallel[X]/log`文件查看运行结果. 训练的损失值将以如下的形式展示:
|
||||
上述的脚本命令将在后台中进行分布式训练,可以通过`distribute_train/nohup.out`文件查看运行结果. 训练的损失值将以如下的形式展示:
|
||||
|
||||
```python
|
||||
# grep "result: " train_parallel*/log
|
||||
epoch: 498 step: 1251, loss is 2.7798953
|
||||
Epoch time: 130690.544, per step time: 104.469
|
||||
epoch: 499 step: 1251, loss is 2.9261637
|
||||
Epoch time: 130511.081, per step time: 104.325
|
||||
epoch: 500 step: 1251, loss is 2.69412
|
||||
Epoch time: 127067.548, per step time: 101.573
|
||||
# grep "result: " distribute_train/nohup.out
|
||||
epoch: 498 step: 1251, loss is 2.7825122
|
||||
epoch time: 200066.210 ms, per step time: 159.925 ms
|
||||
epoch: 499 step: 1251, loss is 2.799798
|
||||
epoch time: 199098.258 ms, per step time: 159.151 ms
|
||||
epoch: 500 step: 1251, loss is 2.8718748
|
||||
epoch time: 197784.661 ms, per step time: 158.101 ms
|
||||
...
|
||||
```
|
||||
|
||||
- 在GPU资源上运行:
|
||||
|
||||
```python
|
||||
bash scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10|imagenet]
|
||||
```
|
||||
|
||||
上述的脚本命令将在后台中进行分布式训练,可以通过`distribute_train_gpu/nohup.out`文件查看运行结果. 训练的损失值将以如下的形式展示:
|
||||
|
||||
```python
|
||||
# grep "result: " distribute_train_gpu/nohup.out
|
||||
epoch: 498 step: 1251, loss is 2.7825122
|
||||
epoch time: 200066.210 ms, per step time: 159.925 ms
|
||||
epoch: 499 step: 1251, loss is 2.799798
|
||||
epoch time: 199098.258 ms, per step time: 159.151 ms
|
||||
epoch: 500 step: 1251, loss is 2.8718748
|
||||
epoch time: 197784.661 ms, per step time: 158.101 ms
|
||||
...
|
||||
```
|
||||
|
||||
|
@ -314,12 +391,34 @@ Tiny-DarkNet是Joseph Chet Redmon等人提出的一个16层的针对于经典的
|
|||
accuracy: {'top_1_accuracy': 0.5871979166666667, 'top_5_accuracy': 0.8175280448717949}
|
||||
```
|
||||
|
||||
- 在GPU资源上进行评估:
|
||||
|
||||
在运行如下命令前,请确认用于评估的checkpoint文件的路径.checkpoint文件须包含在tinydarknet文件夹内.请将checkpoint路径设置为相对于 eval.py文件 的路径,例如:"./ckpts/train_tinydarknet.ckpt"(ckpts 与 eval.py 同级).
|
||||
|
||||
```python
|
||||
bash scripts/run_eval_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]
|
||||
```
|
||||
|
||||
上述的python命令将运行在后台中,可以通过"eval.log"文件查看结果. 测试数据集的准确率将如下面所列:
|
||||
|
||||
```python
|
||||
# grep "accuracy: " eval.log
|
||||
accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
|
||||
```
|
||||
|
||||
请注意在并行训练后,测试请将checkpoint_path设置为最后保存的checkpoint文件的路径,准确率将如下面所列:
|
||||
|
||||
```python
|
||||
# grep "accuracy: " eval.log
|
||||
accuracy: {'top_1_accuracy': 0.5896033653846153, 'top_5_accuracy': 0.8176482371794872}
|
||||
```
|
||||
|
||||
- 在CPU资源上进行评估
|
||||
|
||||
在运行如下命令前,请确认用于评估的checkpoint文件的路径.checkpoint文件须包含在tinydarknet文件夹内.请将checkpoint路径设置为相对于 eval.py文件 的路径,例如:"./ckpts/train_tinydarknet.ckpt"(ckpts 与 eval.py 同级).
|
||||
|
||||
```python
|
||||
bash scripts/run_eval.sh [VAL_DATA_DIR] [imagenet|cifar10] [CHECKPOINT_PATH]
|
||||
bash scripts/run_eval_cpu.sh [VAL_DATA_DIR] [imagenet|cifar10] [CHECKPOINT_PATH]
|
||||
```
|
||||
|
||||
可以通过"eval.log"文件查看结果. 测试数据集的准确率将如下面所列:
|
||||
|
@ -395,34 +494,36 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [LABEL_PATH] [DVPP] [DEVICE_ID]
|
|||
|
||||
### [训练性能](#目录)
|
||||
|
||||
| 参数 | Ascend |
|
||||
| -------------------------- | ----------------------------------------------------------- |
|
||||
| 模型版本 | V1 |
|
||||
| 资源 | Ascend 910;CPU 2.60GHz,56cores;内存 314G;系统 Euler2.8 |
|
||||
| 上传日期 | 2020/12/22 |
|
||||
| MindSpore版本 | 1.1.0 |
|
||||
| 数据集 | 1200k张图片 |
|
||||
| 训练参数 | epoch=500, steps=1251, batch_size=128, lr=0.1 |
|
||||
| 优化器 | Momentum |
|
||||
| 损失函数 | Softmax Cross Entropy |
|
||||
| 速度 | 8卡: 104 ms/step |
|
||||
| 总时间 | 8卡: 17.8小时 |
|
||||
| 参数(M) | 4.0 |
|
||||
| 脚本 | [Tiny-Darknet脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) |
|
||||
#### Tinydarknet on ImageNet 2012
|
||||
|
||||
| 参数 | Ascend | GPU |
|
||||
| -------------------------- | ------------------------------------------------------------| ----------------------------------------------------|
|
||||
| 模型版本 | V1 | V1 |
|
||||
| 资源 | Ascend 910;CPU 2.60GHz,56cores;内存 314G;系统 Euler2.8 | PCIE V100-32G |
|
||||
| 上传日期 | 2020/12/22 | 2021/07/15 |
|
||||
| MindSpore版本 | 1.1.0 | 1.3.0 |
|
||||
| 数据集 | 1200k张图片 | 1200k张图片 |
|
||||
| 训练参数 | epoch=500, steps=1251, batch_size=128, lr=0.1 | epoch=500, steps=1251, batch_size = 128, lr=0.005 |
|
||||
| 优化器 | Momentum | Momentum |
|
||||
| 损失函数 | Softmax Cross Entropy | Softmax Cross Entropy |
|
||||
| 速度 | 8卡: 104 ms/step | 8卡: 255 ms/step |
|
||||
| 总时间 | 8卡: 17.8小时 | 8卡: 46.9小时 |
|
||||
| 参数(M) | 4.0 | 4.0 |
|
||||
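| 脚本 | [Tiny-Darknet脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) | [Tiny-Darknet脚本](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/tinydarknet) |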
|
||||
|
||||
### [评估性能](#目录)
|
||||
|
||||
| 参数 | Ascend |
|
||||
| ------------------- | --------------------------- |
|
||||
| 模型版本 | V1 |
|
||||
| 资源 | Ascend 910;系统 Euler2.8 |
|
||||
| 上传日期 | 2020/12/22 |
|
||||
| MindSpore版本 | 1.1.0 |
|
||||
| 数据集 | 200k张图片 |
|
||||
| batch_size | 128 |
|
||||
| 输出 | 分类概率 |
|
||||
| 准确率 | 8卡 Top-1: 58.7%; Top-5: 81.7% |
|
||||
| 推理模型 | 11.6M (.ckpt文件) |
|
||||
| 参数 | Ascend | GPU |
|
||||
| ------------------- | ----------------------------------| ----------------------------------|
|
||||
| 模型版本 | V1 | V1 |
|
||||
| 资源 | Ascend 910;系统 Euler2.8 | NV SMX2 V100-32G |
|
||||
| 上传日期 | 2020/12/22 | 2021/7/15 |
|
||||
| MindSpore版本 | 1.1.0 | 1.3.0 |
|
||||
| 数据集 | 200k张图片 | 200k张图片 |
|
||||
| batch_size | 128 | 128 |
|
||||
| 输出 | 分类概率 | 分类概率 |
|
||||
| 准确率 | 8卡 Top-1: 58.7%; Top-5: 81.7% | 8卡 Top-1: 58.9%; Top-5: 81.7% |
|
||||
| 推理模型 | 11.6M (.ckpt文件) | 10.06M (.ckpt文件) |
|
||||
|
||||
### [推理性能](#目录)
|
||||
|
||||
|
|
|
@ -0,0 +1,57 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "GPU"
|
||||
enable_profiling: False
|
||||
|
||||
modelarts_dataset_unzip_name: ''
|
||||
# ==============================================================================
|
||||
#train-eval-export related
|
||||
dataset_name : cifar10
|
||||
ckpt_save_dir: checkpoints
|
||||
pre_trained: False
|
||||
device_id: 0
|
||||
num_classes: 10
|
||||
lr_init: 0.1
|
||||
batch_size: 32
|
||||
epoch_size: 120
|
||||
momentum: 0.9
|
||||
weight_decay: 0.0001
|
||||
image_height: 227
|
||||
image_width: 227
|
||||
train_data_dir: './data/cifar10_train/'
|
||||
val_data_dir: './data/cifar10_val/'
|
||||
keep_checkpoint_max: 1
|
||||
checkpoint_path: './scripts/train_parallel4/ckpt_4/train_tinydarknet_imagenet-300_1251.ckpt'
|
||||
onnx_filename: 'tinydarknet.onnx'
|
||||
air_filename: 'tinydarknet.air'
|
||||
# optimizer and lr related
|
||||
lr_scheduler: 'exponential'
|
||||
lr_epochs: [70, 140, 210, 280]
|
||||
lr_gamma: 0.1
|
||||
eta_min: 0.0
|
||||
T_max: 150
|
||||
warmup_epochs: 0
|
||||
# loss related
|
||||
is_dynamic_loss_scale: False
|
||||
loss_scale: 1024
|
||||
label_smooth_factor: 0.1
|
||||
use_label_smooth: True
|
||||
|
||||
---
|
||||
|
||||
# Help description for each configuration
|
||||
enable_modelarts: "Whether training on modelarts, default: False"
|
||||
data_url: "Url for modelarts"
|
||||
train_url: "Url for modelarts"
|
||||
data_path: "The location of the input data."
|
||||
output_path: "The location of the output file."
|
||||
device_target: "Running platform, choose from Ascend, GPU or CPU, and default is Ascend."
|
||||
enable_profiling: 'Whether enable profiling while training, default: False'
|
|
@ -0,0 +1,61 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path"
|
||||
device_target: "GPU"
|
||||
enable_profiling: False
|
||||
|
||||
modelarts_dataset_unzip_name: ''
|
||||
# ==============================================================================
|
||||
#train-eval-export related
|
||||
dataset_name: imagenet
|
||||
ckpt_save_dir: checkpoints
|
||||
pre_trained: False
|
||||
device_id: 0
|
||||
num_classes: 1000
|
||||
lr_init: 0.1
|
||||
batch_size: 128
|
||||
epoch_size: 500
|
||||
momentum: 0.9
|
||||
weight_decay: 0.0001
|
||||
image_height: 224
|
||||
image_width: 224
|
||||
train_data_dir: './dataset/imagenet_original/train/'
|
||||
val_data_dir: './dataset/imagenet_original/val/'
|
||||
keep_checkpoint_max: 1
|
||||
checkpoint_path: './scripts/train_parallel4/ckpt_4/train_tinydarknet_imagenet-300_1251.ckpt'
|
||||
file_name: 'tinydarknet'
|
||||
file_format: 'MINDIR'
|
||||
# optimizer and lr related
|
||||
lr_scheduler: 'exponential'
|
||||
lr_epochs: [70, 140, 210, 280]
|
||||
lr_gamma: 0.3
|
||||
eta_min: 0.0
|
||||
T_max: 150
|
||||
warmup_epochs: 0
|
||||
# loss related
|
||||
is_dynamic_loss_scale: False
|
||||
loss_scale: 1024
|
||||
label_smooth_factor: 0.1
|
||||
use_label_smooth: True
|
||||
#310infer postprocess
|
||||
result_path: ''
|
||||
label_file: ''
|
||||
|
||||
---
|
||||
|
||||
# Help description for each configuration
|
||||
enable_modelarts: "Whether training on modelarts, default: False"
|
||||
data_url: "Url for modelarts"
|
||||
train_url: "Url for modelarts"
|
||||
data_path: "The location of the input data."
|
||||
output_path: "The location of the output file."
|
||||
device_target: "Running platform, choose from Ascend, GPU or CPU, and default is Ascend."
|
||||
enable_profiling: 'Whether enable profiling while training, default: False'
|
||||
file_format: '["MINDIR", "AIR"]'
|
|
@ -57,7 +57,7 @@ do
|
|||
mkdir ./train_parallel$i
|
||||
cp -r ../src ./train_parallel$i
|
||||
cp ../train.py ./train_parallel$i
|
||||
cp ../*.yaml ./train_parallel$i
|
||||
cp -r ../config ./train_parallel$i
|
||||
echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type"
|
||||
cd ./train_parallel$i || exit
|
||||
env > env.log
|
||||
|
|
|
@ -0,0 +1,75 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
if [ $# != 3 ]; then
|
||||
echo "Usage: sh run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] [cifar10|imagenet]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
get_real_path() {
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
else
|
||||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
dataset_type='imagenet'
|
||||
if [ $# == 3 ]
|
||||
then
|
||||
if [ $3 != "cifar10" ] && [ $3 != "imagenet" ]
|
||||
then
|
||||
echo "error: the selected dataset is neither cifar10 nor imagenet"
|
||||
exit 1
|
||||
fi
|
||||
dataset_type=$3
|
||||
fi
|
||||
|
||||
export RANK_SIZE=$1
|
||||
PROJECT_DIR=$(cd ./"`dirname $0`" || exit; pwd)
|
||||
TRAIN_DATA_DIR=$(get_real_path $2)
|
||||
|
||||
if [ ! -d $TRAIN_DATA_DIR ]; then
|
||||
echo "error: TRAIN_DATA_DIR=$TRAIN_DATA_DIR is not a directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
if [ -d "distribute_train_gpu" ]; then
|
||||
rm -rf ./distribute_train_gpu
|
||||
fi
|
||||
|
||||
mkdir ./distribute_train_gpu
|
||||
cp ./*.py ./distribute_train_gpu
|
||||
cp -r ./config ./distribute_train_gpu
|
||||
cp -r ./src ./distribute_train_gpu
|
||||
cd ./distribute_train_gpu || exit
|
||||
|
||||
if [ $3 == 'imagenet' ]; then
|
||||
CONFIG_FILE="$PROJECT_DIR/../config/imagenet_config_gpu.yaml"
|
||||
elif [ $3 == 'cifar10' ]; then
|
||||
CONFIG_FILE="$PROJECT_DIR/../config/cifar10_config_gpu.yaml"
|
||||
else
|
||||
echo "error: the selected dataset is neither cifar10 nor imagenet"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
|
||||
nohup python train.py \
|
||||
--config_path=$CONFIG_FILE \
|
||||
--dataset_name=$dataset_type \
|
||||
--train_data_dir=$TRAIN_DATA_DIR \
|
||||
--device_target=GPU > log.txt 2>&1 &
|
||||
cd ..
|
|
@ -22,7 +22,7 @@ rm -rf ./eval
|
|||
mkdir ./eval
|
||||
cp -r ../src ./eval
|
||||
cp ../eval.py ./eval
|
||||
cp ../*.yaml ./eval
|
||||
cp -r ../config ./eval
|
||||
cd ./eval || exit
|
||||
env >env.log
|
||||
python ./eval.py > ./eval.log 2>&1 &
|
||||
|
|
|
@ -43,9 +43,9 @@ fi
|
|||
|
||||
BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
|
||||
if [ $2 == 'imagenet' ]; then
|
||||
CONFIG_FILE="${BASE_PATH}/imagenet_config.yaml"
|
||||
CONFIG_FILE="${BASE_PATH}/config/imagenet_config.yaml"
|
||||
elif [ $2 == 'cifar10' ]; then
|
||||
CONFIG_FILE="${BASE_PATH}/cifar10_config.yaml"
|
||||
CONFIG_FILE="${BASE_PATH}/config/cifar10_config.yaml"
|
||||
else
|
||||
echo "error: the selected dataset is neither cifar10 nor imagenet"
|
||||
exit 1
|
||||
|
@ -55,7 +55,7 @@ rm -rf ./eval
|
|||
mkdir ./eval
|
||||
cp -r ./src ./eval
|
||||
cp ./eval.py ./eval
|
||||
cp ./*.yaml ./eval
|
||||
cp -r ./config ./eval
|
||||
env >env.log
|
||||
echo "start evaluation for device CPU"
|
||||
cd ./eval || exit
|
||||
|
|
|
@ -0,0 +1,64 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
if [ $# != 1 ] && [ $# != 2 ] && [ $# != 3 ]
|
||||
then
|
||||
echo "Usage bash scripts/run_train_gpu.sh [VAL_DATA_DIR] [cifar10|imagenet] [checkpoint_path]"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
get_real_path(){
|
||||
if [ "${1:0:1}" == "/" ]; then
|
||||
echo "$1"
|
||||
else
|
||||
echo "$(realpath -m $PWD/$1)"
|
||||
fi
|
||||
}
|
||||
|
||||
PATH1=$(get_real_path $1)
|
||||
if [ ! -d $PATH1 ]
|
||||
then
|
||||
echo "error: VAL_DATA_DIR=$PATH1 is not a directory"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
PATH2=$(get_real_path $3)
|
||||
if [ ! -f $PATH2 ]
|
||||
then
|
||||
echo "error: CHECKPOINT_PATH=$PATH2 is not a file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
BASE_PATH=$(dirname "$(dirname "$(readlink -f $0)")")
|
||||
if [ $2 == 'imagenet' ]; then
|
||||
CONFIG_FILE="${BASE_PATH}/config/imagenet_config_gpu.yaml"
|
||||
elif [ $2 == 'cifar10' ]; then
|
||||
CONFIG_FILE="${BASE_PATH}/config/cifar10_config_gpu.yaml"
|
||||
else
|
||||
echo "error: the selected dataset is neither cifar10 nor imagenet"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
rm -rf ./eval
|
||||
mkdir ./eval
|
||||
cp -r ./src ./eval
|
||||
cp ./eval.py ./eval
|
||||
cp -r ./config ./eval
|
||||
env >env.log
|
||||
echo "start evaluation for device GPU"
|
||||
cd ./eval || exit
|
||||
python ./eval.py --device_target=GPU --val_data_dir=$PATH1 --dataset_name=$2 --config_path=$CONFIG_FILE \
|
||||
--checkpoint_path=$PATH2 > ./eval.log 2>&1 &
|
||||
cd ..
|
|
@ -55,7 +55,7 @@ rm -rf ./train_single
|
|||
mkdir ./train_single
|
||||
cp -r ../src ./train_single
|
||||
cp ../train.py ./train_single
|
||||
cp ../*.yaml ./train_single
|
||||
cp -r ../config ./train_single
|
||||
echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type"
|
||||
cd ./train_single || exit
|
||||
python ./train.py --dataset_name=$dataset_type --train_data_dir=$train_data_dir> ./train.log 2>&1 &
|
||||
|
|
|
@ -0,0 +1,73 @@
|
|||
#!/usr/bin/env bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Echo the raw positional arguments for the log.
echo "$1 $2 $3"

# DEVICE_ID and TRAIN_DATA_DIR are mandatory; the dataset name is optional.
if [ $# != 2 ] && [ $# != 3 ]
then
    # $0 is printed so the usage line always names the script actually invoked.
    echo "Usage: bash $0 [DEVICE_ID] [TRAIN_DATA_DIR] [cifar10|imagenet]"
    exit 1
fi
|
||||
|
||||
# DEVICE_ID must be a non-negative integer. A pattern match is used instead of
# `expr $1 + 6`: expr also exits non-zero when the computed value is 0, so its
# status is not a reliable "is this an integer" signal.
if ! [[ "$1" =~ ^[0-9]+$ ]]
then
    echo "error:DEVICE_ID=$1 is not an integer"
    exit 1
fi
|
||||
|
||||
if [ ! -d "$2" ]
then
    echo "error:TRAIN_DATA_DIR=$2 is not a folder"
    exit 1
fi
# Stored as an absolute path: train.py is started after `cd ./train_single_gpu`,
# where a relative path would no longer point at the dataset.
train_data_dir=$(realpath "$2")
# Directory containing this script. `dirname "$0"` is used directly (without a
# "./" prefix) so the lookup also works when the script is invoked through an
# absolute path; CDPATH is cleared so `cd` neither searches it nor prints.
PROJECT_DIR=$(CDPATH= cd -- "$(dirname -- "$0")" && pwd)
# Defaults used when no dataset name is given on the command line.
CONFIG_FILE="$PROJECT_DIR/../config/imagenet_config_gpu.yaml"
dataset_type='imagenet'
|
||||
# An optional third argument overrides the default dataset name.
if [ $# == 3 ]
then
    dataset_type=$3
fi

# Select the config from dataset_type rather than from $3: $3 is unset when
# only two arguments are given, and testing it directly made the default
# (two-argument) invocation fail with the "neither cifar10 nor imagenet" error.
case "$dataset_type" in
    imagenet)
        CONFIG_FILE="$PROJECT_DIR/../config/imagenet_config_gpu.yaml"
        ;;
    cifar10)
        CONFIG_FILE="$PROJECT_DIR/../config/cifar10_config_gpu.yaml"
        ;;
    *)
        echo "error: the selected dataset is neither cifar10 nor imagenet"
        exit 1
        ;;
esac
|
||||
|
||||
# Single-device run: one rank on the device chosen by the first argument.
export DEVICE_ID=$1
export RANK_ID=0
export DEVICE_NUM=1
export RANK_SIZE=1
# Recreate ./train_single_gpu and copy the sources, the entry script and the
# configs into it. The copies use "../" paths, so the script has to be started
# from a directory that sits next to src/, train.py and config/.
rm -rf ./train_single_gpu
mkdir ./train_single_gpu
cp -r ../src ./train_single_gpu
cp ../train.py ./train_single_gpu
cp -r ../config ./train_single_gpu
echo "start training for rank $RANK_ID, device $DEVICE_ID, $dataset_type"
cd ./train_single_gpu || exit
# Runs in the background; stdout and stderr are collected in train.log.
# Arguments are quoted so paths containing whitespace are passed intact.
python ./train.py --config_path="$CONFIG_FILE" \
    --dataset_name="$dataset_type" --train_data_dir="$train_data_dir" --device_target=GPU > ./train.log 2>&1 &
|
||||
|
|
@ -49,7 +49,7 @@ rm -rf ./train_cpu
|
|||
mkdir ./train_cpu
|
||||
cp ./train.py ./train_cpu
|
||||
cp -r ./src ./train_cpu
|
||||
cp ./*.yaml ./train_cpu
|
||||
cp -r ./config ./train_cpu
|
||||
echo "start training for device CPU"
|
||||
cd ./train_cpu || exit
|
||||
env > env.log
|
||||
|
|
|
@ -40,14 +40,10 @@ def create_dataset_cifar(dataset_path,
|
|||
Returns:
|
||||
dataset
|
||||
"""
|
||||
if target == "Ascend":
|
||||
device_num, rank_id = _get_rank_info()
|
||||
elif target == "CPU":
|
||||
if target == "CPU":
|
||||
device_num = 1
|
||||
else:
|
||||
init()
|
||||
rank_id = get_rank()
|
||||
device_num = get_group_size()
|
||||
device_num, rank_id = _get_rank_info()
|
||||
|
||||
if device_num == 1:
|
||||
data_set = ds.Cifar10Dataset(dataset_path,
|
||||
|
@ -165,7 +161,8 @@ def _get_rank_info():
|
|||
rank_size = int(os.environ.get("RANK_SIZE", 1))
|
||||
|
||||
if rank_size > 1:
|
||||
from mindspore.communication.management import get_rank, get_group_size
|
||||
from mindspore.communication.management import init, get_rank, get_group_size
|
||||
init()
|
||||
rank_size = get_group_size()
|
||||
rank_id = get_rank()
|
||||
else:
|
||||
|
|
|
@ -117,7 +117,7 @@ def get_config():
|
|||
"""
|
||||
parser = argparse.ArgumentParser(description="default name", add_help=False)
|
||||
current_dir = os.path.dirname(os.path.abspath(__file__))
|
||||
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../{}".format(_config)),
|
||||
parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../config/{}".format(_config)),
|
||||
help="Config file path")
|
||||
path_args, _ = parser.parse_known_args()
|
||||
default, helper, choices = parse_yaml(path_args.config_path)
|
||||
|
|
|
@ -21,7 +21,7 @@ import time
|
|||
|
||||
from mindspore import Tensor
|
||||
from mindspore import context
|
||||
from mindspore.communication.management import init
|
||||
from mindspore.communication.management import init, get_rank
|
||||
from mindspore.nn.optim.momentum import Momentum
|
||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
|
||||
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
|
||||
|
@ -36,7 +36,7 @@ from src.tinydarknet import TinyDarkNet
|
|||
from src.CrossEntropySmooth import CrossEntropySmooth
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
|
||||
from src.model_utils.device_adapter import get_device_id, get_device_num
|
||||
|
||||
set_seed(1)
|
||||
|
||||
|
@ -132,11 +132,11 @@ def run_train():
|
|||
else:
|
||||
context.set_context(device_id=get_device_id())
|
||||
if device_num > 1:
|
||||
init()
|
||||
context.reset_auto_parallel_context()
|
||||
context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||
gradients_mean=True)
|
||||
init()
|
||||
rank = get_rank_id()
|
||||
rank = get_rank()
|
||||
|
||||
if config.dataset_name == "imagenet":
|
||||
dataset = create_dataset_imagenet(config.train_data_dir, 1)
|
||||
|
@ -204,10 +204,12 @@ def run_train():
|
|||
|
||||
if device_target == "CPU":
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'}, loss_scale_manager=loss_scale_manager)
|
||||
else:
|
||||
elif device_target == "Ascend":
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
|
||||
amp_level="O3", loss_scale_manager=loss_scale_manager)
|
||||
|
||||
elif device_target == "GPU":
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
|
||||
amp_level="O2", loss_scale_manager=loss_scale_manager)
|
||||
config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 50, keep_checkpoint_max=config.keep_checkpoint_max)
|
||||
time_cb = TimeMonitor(data_size=batch_num)
|
||||
ckpt_save_dir = os.path.join(config.ckpt_save_dir, str(rank))
|
||||
|
|
Loading…
Reference in New Issue