forked from mindspore-Ecosystem/mindspore
Combine resnext50 and resnext101
This commit is contained in:
parent
43174475e6
commit
2c5aeec322
|
@ -1,6 +1,6 @@
|
|||
# Contents
|
||||
|
||||
- [ResNeXt50 Description](#resnext50-description)
|
||||
- [ResNeXt Description](#resnext-description)
|
||||
- [Model Architecture](#model-architecture)
|
||||
- [Dataset](#dataset)
|
||||
- [Features](#features)
|
||||
|
@ -21,7 +21,7 @@
|
|||
- [Description of Random Situation](#description-of-random-situation)
|
||||
- [ModelZoo Homepage](#modelzoo-homepage)
|
||||
|
||||
# [ResNeXt50 Description](#contents)
|
||||
# [ResNeXt Description](#contents)
|
||||
|
||||
ResNeXt is a simple, highly modularized network architecture for image classification. Its design results in a homogeneous, multi-branch architecture that has only a few hyper-parameters to set. This strategy exposes a new dimension, which we call “cardinality” (the size of the set of transformations), as an essential factor in addition to the dimensions of depth and width.
|
||||
|
||||
|
@ -70,7 +70,7 @@ If you want to run in modelarts, please check the official documentation of [mod
|
|||
# Set other parameters on yaml file you need.
|
||||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add other parameters on the website UI interface.
|
||||
# (2) Set the code directory to "/path/resnext50" on the website UI interface.
|
||||
# (2) Set the code directory to "/path/resnext" on the website UI interface.
|
||||
# (3) Set the startup file to "train.py" on the website UI interface.
|
||||
# (4) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (5) Create your job.
|
||||
|
@ -84,7 +84,7 @@ If you want to run in modelarts, please check the official documentation of [mod
|
|||
# b. Add "enable_modelarts=True" on the website UI interface.
|
||||
# Add "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" on the website UI interface.
|
||||
# Add "checkpoint_url=/The path of checkpoint in S3/" on the website UI interface.
|
||||
# (3) Set the code directory to "/path/resnext50" on the website UI interface.
|
||||
# (3) Set the code directory to "/path/resnext" on the website UI interface.
|
||||
# (4) Set the startup file to "eval.py" on the website UI interface.
|
||||
# (5) Set the "Dataset path" and "Output file path" and "Job log path" to your path on the website UI interface.
|
||||
# (6) Create your job.
|
||||
|
@ -96,7 +96,7 @@ If you want to run in modelarts, please check the official documentation of [mod
|
|||
|
||||
```python
|
||||
.
|
||||
└─resnext50
|
||||
└─resnext
|
||||
├─README.md
|
||||
├─scripts
|
||||
├─run_standalone_train.sh # launch standalone training for ascend(1p)
|
||||
|
@ -107,7 +107,7 @@ If you want to run in modelarts, please check the official documentation of [mod
|
|||
├─src
|
||||
├─backbone
|
||||
├─_init_.py # initialize
|
||||
├─resnet.py # resnext50 backbone
|
||||
├─resnet.py # resnext backbone
|
||||
├─utils
|
||||
├─_init_.py # initialize
|
||||
├─cunstom_op.py # network operation
|
||||
|
@ -230,7 +230,7 @@ PLATFORM is Ascend or GPU, default is Ascend.
|
|||
|
||||
```bash
|
||||
# Evaluation with checkpoint
|
||||
sh scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /resnext50_100.ckpt Ascend
|
||||
sh scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /resnext_100.ckpt Ascend
|
||||
```
|
||||
|
||||
#### Result
|
||||
|
@ -268,10 +268,14 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID]
|
|||
|
||||
Inference result is saved in current path, you can find result in acc.log file.
|
||||
|
||||
```log
|
||||
```resnext50
|
||||
Total data:50000, top1 accuracy:0.78462, top5 accuracy:0.94182
|
||||
```
|
||||
|
||||
```resnext101
|
||||
Total data:50000, top1 accuracy:0.79858, top5 accuracy:0.94716
|
||||
```
|
||||
|
||||
# [Model description](#contents)
|
||||
|
||||
## [Performance](#contents)
|
||||
|
@ -284,7 +288,7 @@ Total data:50000, top1 accuracy:0.78462, top5 accuracy:0.94182
|
|||
| uploaded Date | 06/30/2020 | 07/23/2020 |
|
||||
| MindSpore Version | 0.5.0 | 0.6.0 |
|
||||
| Dataset | ImageNet | ImageNet |
|
||||
| Training Parameters | src/config.py | src/config.py |
|
||||
| Training Parameters | default_config.yaml | default_config.yaml |
|
||||
| Optimizer | Momentum | Momentum |
|
||||
| Loss Function | SoftmaxCrossEntropy | SoftmaxCrossEntropy |
|
||||
| Loss | 1.76592 | 1.8965 |
|
||||
|
@ -292,9 +296,21 @@ Total data:50000, top1 accuracy:0.78462, top5 accuracy:0.94182
|
|||
| Total time | 7.8 h 8ps | 21.5 h 8ps |
|
||||
| Checkpoint for Fine tuning | 192 M(.ckpt file) | 192 M(.ckpt file) |
|
||||
|
||||
| Parameters                 | ResNeXt101                                                 |
|
||||
| -------------------------- | ---------------------------------------------------------- |
|
||||
| Resource | Ascend 910; cpu 2.60GHz, 192cores; memory 755G; OS Euler2.8|
|
||||
| uploaded Date              | 06/22/2021 (month/day/year)                                |
|
||||
| MindSpore Version | 1.2.0 |
|
||||
| Dataset | ImageNet |
|
||||
| Training Parameters | default_config.yaml |
|
||||
| Optimizer | Momentum |
|
||||
| Loss Function | SoftmaxCrossEntropy |
|
||||
| Accuracy                   | 79.56%(TOP1)                                               |
|
||||
| train performance | 196.33image/sec 1ps |
|
||||
|
||||
#### Inference Performance
|
||||
|
||||
| Parameters | | | |
|
||||
| Parameters | ResNeXt50 | | |
|
||||
| -------------------------- | ----------------------------- | ------------------------- | -------------------- |
|
||||
| Resource | Ascend 910; OS Euler2.8 | NV SMX2 V100-32G | Ascend 310 |
|
||||
| uploaded Date | 06/30/2020 | 07/23/2020 | 07/23/2020 |
|
||||
|
@ -304,6 +320,17 @@ Total data:50000, top1 accuracy:0.78462, top5 accuracy:0.94182
|
|||
| outputs | probability | probability | probability |
|
||||
| Accuracy | acc=78.16%(TOP1) | acc=78.05%(TOP1) | |
|
||||
|
||||
| Parameters | Ascend |
|
||||
| ------------------- | --------------------------- |
|
||||
| Model Version | ResNeXt101 |
|
||||
| Resource | Ascend 310; OS Euler2.8 |
|
||||
| Uploaded Date       | 06/22/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.2.0 |
|
||||
| Dataset | ImageNet |
|
||||
| batch_size | 1 |
|
||||
| outputs             | probability                 |
|
||||
| Accuracy | TOP1: 79.85% |
|
||||
|
||||
# [Description of Random Situation](#contents)
|
||||
|
||||
In dataset.py, we set the seed inside the "create_dataset" function. We also use a random seed in train.py.
|
|
@ -1,7 +1,7 @@
|
|||
# 目录
|
||||
|
||||
- [目录](#目录)
|
||||
- [ResNeXt50说明](#resnext50说明)
|
||||
- [ResNeXt说明](#resnext说明)
|
||||
- [模型架构](#模型架构)
|
||||
- [数据集](#数据集)
|
||||
- [特性](#特性)
|
||||
|
@ -28,7 +28,7 @@
|
|||
- [随机情况说明](#随机情况说明)
|
||||
- [ModelZoo主页](#modelzoo主页)
|
||||
|
||||
# ResNeXt50说明
|
||||
# ResNeXt说明
|
||||
|
||||
ResNeXt是一个简单、高度模块化的图像分类网络架构。ResNeXt的设计为统一的、多分支的架构,该架构仅需设置几个超参数。此策略提供了一个新维度,我们将其称为“基数”(转换集的大小),它是深度和宽度维度之外的一个重要因素。
|
||||
|
||||
|
@ -73,12 +73,12 @@ ResNeXt整体网络架构如下:
|
|||
|
||||
```python
|
||||
# 在modelarts上使用分布式训练的示例:
|
||||
# (1) 选址a或者b其中一种方式。
|
||||
# (1) 选择a或者b其中一种方式。
|
||||
# a. 设置 "enable_modelarts=True" 。
|
||||
# 在yaml文件上设置网络所需的参数。
|
||||
#          b. 增加 "enable_modelarts=True" 参数在modelarts的界面上。
|
||||
# 在modelarts的界面上设置网络所需的参数。
|
||||
# (2) 在modelarts的界面上设置代码的路径 "/path/resnext50"。
|
||||
# (2) 在modelarts的界面上设置代码的路径 "/path/ResNeXt"。
|
||||
# (3) 在modelarts的界面上设置模型的启动文件 "train.py" 。
|
||||
# (4) 在modelarts的界面上设置模型的数据路径 "Dataset path" ,
|
||||
# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。
|
||||
|
@ -93,7 +93,7 @@ ResNeXt整体网络架构如下:
|
|||
#          b. 增加 "enable_modelarts=True" 参数在modelarts的界面上。
|
||||
#             增加 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" 参数在modelarts的界面上。
|
||||
#             增加 "checkpoint_url=/The path of checkpoint in S3/" 参数在modelarts的界面上。
|
||||
# (3) 在modelarts的界面上设置代码的路径 "/path/resnext50"。
|
||||
# (3) 在modelarts的界面上设置代码的路径 "/path/ResNeXt"。
|
||||
# (4) 在modelarts的界面上设置模型的启动文件 "eval.py" 。
|
||||
# (5) 在modelarts的界面上设置模型的数据路径 "Dataset path" ,
|
||||
# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。
|
||||
|
@ -106,7 +106,7 @@ ResNeXt整体网络架构如下:
|
|||
|
||||
```path
|
||||
.
|
||||
└─resnext50
|
||||
└─ResNeXt
|
||||
├─README.md
|
||||
├─scripts
|
||||
├─run_standalone_train.sh # 启动Ascend单机训练(单卡)
|
||||
|
@ -117,7 +117,7 @@ ResNeXt整体网络架构如下:
|
|||
├─src
|
||||
├─backbone
|
||||
├─_init_.py # 初始化
|
||||
├─resnet.py # ResNeXt50骨干
|
||||
├─resnet.py # ResNeXt骨干
|
||||
├─utils
|
||||
├─_init_.py # 初始化
|
||||
├─cunstom_op.py # 网络操作
|
||||
|
@ -159,7 +159,7 @@ ResNeXt整体网络架构如下:
|
|||
"eta_min": 0, # cosine_annealing调度器中的eta_min
|
||||
"T_max": 150, # cosine_annealing调度器中的T-max
|
||||
"max_epoch": 150, # 训练模型的最大轮次数量
|
||||
"backbone": 'resnext50', # 骨干网络
|
||||
"backbone": 'ResNeXt', # 骨干网络
|
||||
"warmup_epochs" : 1, # 热身轮次
|
||||
"weight_decay": 0.0001, # 权重衰减
|
||||
"momentum": 0.9, # 动量
|
||||
|
@ -238,18 +238,23 @@ DEVICE_TARGET is Ascend or GPU, default is Ascend.
|
|||
|
||||
```shell
|
||||
# 检查点评估
|
||||
sh scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /resnext50_100.ckpt Ascend
|
||||
sh scripts/run_eval.sh 0 /opt/npu/datasets/classification/val /ResNeXt_100.ckpt Ascend
|
||||
```
|
||||
|
||||
#### 结果
|
||||
|
||||
评估结果保存在脚本路径下。您可以在日志中找到类似以下的结果。
|
||||
|
||||
```log
|
||||
```resnext50
|
||||
acc=78.16%(TOP1)
|
||||
acc=93.88%(TOP5)
|
||||
```
|
||||
|
||||
```resnext101
|
||||
acc=79.86%(TOP1)
|
||||
acc=94.72%(TOP5)
|
||||
```
|
||||
|
||||
## 模型导出
|
||||
|
||||
```shell
|
||||
|
@ -277,10 +282,14 @@ bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID]
|
|||
|
||||
推理结果保存在当前路径,可在acc.log中看到最终精度结果。
|
||||
|
||||
```log
|
||||
```resnext50
|
||||
Total data:50000, top1 accuracy:0.78462, top5 accuracy:0.94182
|
||||
```
|
||||
|
||||
```resnext101
|
||||
Total data:50000, top1 accuracy:0.79858, top5 accuracy:0.94716
|
||||
```
|
||||
|
||||
# 模型描述
|
||||
|
||||
## 性能
|
||||
|
@ -303,7 +312,7 @@ Total data:50000, top1 accuracy:0.78462, top5 accuracy:0.94182
|
|||
|
||||
#### 推理性能
|
||||
|
||||
| 参数 | | | |
|
||||
| 参数 |ResNeXt50 | | |
|
||||
| -------------------------- | ----------------------------- | ------------------------- | -------------------- |
|
||||
| 资源 | Ascend 910;系统 Euler2.8 | NV SMX2 V100-32G | Ascend 310 |
|
||||
| 上传日期 | 2020-6-30 | 2020-7-23 | 2020-7-23 |
|
||||
|
@ -313,6 +322,16 @@ Total data:50000, top1 accuracy:0.78462, top5 accuracy:0.94182
|
|||
| 输出 | 概率 | 概率 | 概率 |
|
||||
| 准确率 | acc=78.16%(TOP1) | acc=78.05%(TOP1) | |
|
||||
|
||||
| 参数 | ResNeXt101 |
|
||||
| ------------------- | --------------------------- |
|
||||
| 资源 | Ascend 310; OS Euler2.8 |
|
||||
| 上传日期            | 2021-6-22                   |
|
||||
| MindSpore版本 | 1.2.0 |
|
||||
| 数据集 | ImageNet |
|
||||
| batch_size | 1 |
|
||||
| 输出 | 概率 |
|
||||
| 准确率 | TOP1: 79.85%, TOP5: 94.71% |
|
||||
|
||||
# 随机情况说明
|
||||
|
||||
dataset.py中设置了“create_dataset”函数内的种子,同时还使用了train.py中的随机种子。
|
|
@ -1,5 +1,6 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
network: "resnext50"
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
|
@ -48,13 +49,14 @@ log_path: './output_log'
|
|||
device_id: 0
|
||||
width: 224
|
||||
height: 224
|
||||
file_name: "resnext101"
|
||||
file_name: "resnext50"
|
||||
file_format: "AIR"
|
||||
result_path: ""
|
||||
label_path: ""
|
||||
|
||||
---
|
||||
# Help description for each configuration
|
||||
network: "Candidate networks: [resnext50, resnext101], default: resnext50"
|
||||
enable_modelarts: 'Whether training on modelarts, default: False'
|
||||
data_url: 'Dataset url for obs'
|
||||
train_url: 'Training output url for obs'
|
|
@ -159,7 +159,7 @@ def test(cloud_args=None):
|
|||
max_epoch=1, rank=config.rank, group_size=config.group_size,
|
||||
mode='eval')
|
||||
eval_dataloader = de_dataset.create_tuple_iterator(output_numpy=True, num_epochs=1)
|
||||
network = get_network(num_classes=config.num_classes, platform=config.device_target)
|
||||
network = get_network(network=config.network, num_classes=config.num_classes, platform=config.device_target)
|
||||
|
||||
load_pretrain_model(model, network, config)
|
||||
|
|
@ -28,7 +28,7 @@ if config.device_target == "Ascend":
|
|||
context.set_context(device_id=config.device_id)
|
||||
|
||||
if __name__ == '__main__':
|
||||
network = get_network(num_classes=config.num_classes, platform=config.device_target)
|
||||
network = get_network(network=config.network, num_classes=config.num_classes, platform=config.device_target)
|
||||
|
||||
param_dict = load_checkpoint(config.checkpoint_file_path)
|
||||
load_param_into_net(network, param_dict)
|
|
@ -23,7 +23,7 @@ from mindspore.common.initializer import TruncatedNormal
|
|||
from src.utils.cunstom_op import SEBlock, GroupConv
|
||||
|
||||
|
||||
__all__ = ['ResNet', 'resnext50']
|
||||
__all__ = ['ResNet', 'resnext50', 'resnext101']
|
||||
|
||||
|
||||
def weight_variable(shape, factor=0.1):
|
||||
|
@ -277,3 +277,6 @@ class ResNet(nn.Cell):
|
|||
|
||||
# Build the ResNeXt50 backbone: a ResNet assembled from Bottleneck blocks with
# the block-count list [3, 4, 6, 3] (presumably blocks per stage), groups=32
# and width_per_group=4. `platform` (default "Ascend") is forwarded to ResNet
# unchanged.
def resnext50(platform="Ascend"):
|
||||
return ResNet(Bottleneck, [3, 4, 6, 3], width_per_group=4, groups=32, platform=platform)
|
||||
|
||||
# Build the ResNeXt101 backbone: same construction as resnext50 but with the
# block-count list [3, 4, 23, 3]. `platform` (default "Ascend") is forwarded to
# ResNet unchanged.
# NOTE(review): the standalone ResNeXt101 README removed by this change
# describes a 64x4d network (C=64), whereas groups=32 is used here - confirm
# the intended cardinality and that the published accuracy matches it.
def resnext101(platform="Ascend"):
|
||||
return ResNet(Bottleneck, [3, 4, 23, 3], width_per_group=4, groups=32, platform=platform)
|
|
@ -94,5 +94,9 @@ class Resnet(ImageClassificationNetwork):
|
|||
|
||||
|
||||
|
||||
def get_network(**kwargs):
|
||||
return Resnet('resnext50', **kwargs)
|
||||
def get_network(network, **kwargs):
    """Create the image-classification network selected by ``network``.

    Args:
        network: Backbone name; must be 'resnext50' or 'resnext101'.
        **kwargs: Forwarded unchanged to ``Resnet``.

    Returns:
        A ``Resnet`` instance built on the requested backbone.

    Raises:
        NotImplementedError: If ``network`` is not a supported backbone name.
    """
    supported = ('resnext50', 'resnext101')
    if network in supported:
        # The validated name is exactly the backbone key Resnet expects.
        return Resnet(network, **kwargs)
    raise NotImplementedError(f"The network {network} not in [resnext50, resnext101].")
|
|
@ -158,7 +158,7 @@ def train():
|
|||
# network
|
||||
config.logger.important_info('start create network')
|
||||
# get network and init
|
||||
network = get_network(num_classes=config.num_classes, platform=config.device_target)
|
||||
network = get_network(network=config.network, num_classes=config.num_classes, platform=config.device_target)
|
||||
|
||||
load_pretrain_model(config.checkpoint_file_path, network, config)
|
||||
|
|
@ -1,304 +0,0 @@
|
|||
# ResNext101-64x4d
|
||||
|
||||
本仓库提供了ResNeXt101-64x4d模型的训练脚本和超参配置,以达到论文中的准确性。
|
||||
|
||||
## 模型概述
|
||||
|
||||
模型名称:ResNeXt101
|
||||
|
||||
论文:`"Aggregated Residual Transformations for Deep Neural Networks" <https://arxiv.org/pdf/1611.05431.pdf>`
|
||||
|
||||
这里提供的版本是ResNeXt101-64x4d
|
||||
|
||||
### 模型架构
|
||||
|
||||
ResNeXt是ResNet网络的改进版本,其残差块比ResNet多了cardinality(基数)设置。ResNeXt101-64x4d的网络结构如下:
|
||||
|
||||
| 网络层 | 输出 | 参数 |
|
||||
| ---------- | ------- | ------------------------------------------- |
|
||||
| conv1 | 112x112 | 7x7,64,stride 2 |
|
||||
| maxpooline | 56x56 | 3x3,stride 2 |
|
||||
| conv2 | 56x56 | [(1x1,64)->(3x3,64)->(1x1,256) C=64]x3 |
|
||||
| conv3 | 28x28 | [(1x1,256)->(3x3,256)->(1x1,512) C=64]x4 |
|
||||
| conv4 | 14x14 | [(1x1,512)->(3x3,512)->(1x1,1024) C=64]x23 |
|
||||
| conv5 | 7x7 | [(1x1,1024)->(3x3,1024)->(1x1,2048) C=64]x3 |
|
||||
| | 1x1 | average pool;1000-d fc;softmax |
|
||||
|
||||
### 默认设置
|
||||
|
||||
以下各节介绍ResNeXt101-64x4d模型的默认配置和超参数。
|
||||
|
||||
#### 优化器
|
||||
|
||||
本模型使用Mindspore框架提供的Momentum优化器,超参设置如下:
|
||||
|
||||
- Momentum : 0.9
|
||||
- Learning rate (LR) : 0.05
|
||||
- LR schedule: cosine_annealing
|
||||
- LR epochs: [30, 60, 90, 120]
|
||||
- LR gamma: 0.1
|
||||
- Batch size : 64
|
||||
- Weight decay : 0.0001.
|
||||
- Label smoothing = 0.1
|
||||
- Eta_min: 0
|
||||
- Warmup_epochs: 1
|
||||
- Loss_scale: 1024
|
||||
- 训练轮次:
|
||||
- 150 epochs
|
||||
|
||||
#### 数据增强
|
||||
|
||||
本模型使用了以下数据增强:
|
||||
|
||||
- 对于训练脚本:
|
||||
- RandomResizeCrop, scale=(0.08, 1.0), ratio=(0.75, 1.333)
|
||||
- RandomHorizontalFlip, prob=0.5
|
||||
- Normalize, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
|
||||
- 对于验证(前向推理):
|
||||
- Resize to (256, 256)
|
||||
- CenterCrop to (224, 224)
|
||||
- Normalize, mean=(0.485, 0.456, 0.406), std=(0.229, 0.224, 0.225)
|
||||
|
||||
## 设定
|
||||
|
||||
以下各节列出了开始训练ResNext101-64x4d模型的要求。
|
||||
|
||||
如果要在modelarts上进行模型的训练,可以参考modelarts的官方指导文档(https://support.huaweicloud.com/modelarts/)
|
||||
开始进行模型的训练和推理,具体操作如下:
|
||||
|
||||
```python
|
||||
# 在modelarts上使用分布式训练的示例:
|
||||
# (1) 选址a或者b其中一种方式。
|
||||
# a. 设置 "enable_modelarts=True" 。
|
||||
# 在yaml文件上设置网络所需的参数。
|
||||
# b. 增加 "enable_modelarts=True" 参数在modearts的界面上。
|
||||
# 在modelarts的界面上设置网络所需的参数。
|
||||
# (2) 在modelarts的界面上设置代码的路径 "/path/resnext101"。
|
||||
# (3) 在modelarts的界面上设置模型的启动文件 "train.py" 。
|
||||
# (4) 在modelarts的界面上设置模型的数据路径 "Dataset path" ,
|
||||
# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。
|
||||
# (5) 开始模型的训练。
|
||||
|
||||
# 在modelarts上使用模型推理的示例
|
||||
# (1) 把训练好的模型放到桶的对应位置。
|
||||
# (2) 选址a或者b其中一种方式。
|
||||
# a. 设置 "enable_modelarts=True"
|
||||
# 设置 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt" 在 yaml 文件.
|
||||
# 设置 "checkpoint_url=/The path of checkpoint in S3/" 在 yaml 文件.
|
||||
# b. 增加 "enable_modelarts=True" 参数在modearts的界面上。
|
||||
# 增加 "checkpoint_file_path='/cache/checkpoint_path/model.ckpt'" 参数在modearts的界面上。
|
||||
# 增加 "checkpoint_url=/The path of checkpoint in S3/" 参数在modearts的界面上。
|
||||
# (3) 在modelarts的界面上设置代码的路径 "/path/resnext101"。
|
||||
# (4) 在modelarts的界面上设置模型的启动文件 "eval.py" 。
|
||||
# (5) 在modelarts的界面上设置模型的数据路径 "Dataset path" ,
|
||||
# 模型的输出路径"Output file path" 和模型的日志路径 "Job log path" 。
|
||||
# (6) 开始模型的推理。
|
||||
```
|
||||
|
||||
## 快速入门指南
|
||||
|
||||
目录说明,代码参考了Modelzoo上的[ResNext50](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/resnext50)
|
||||
|
||||
```path
|
||||
.
|
||||
└─resnext101-64x4d-mindspore
|
||||
├─README.md
|
||||
├─ascend310_infer #310推理依赖的应用
|
||||
├─scripts
|
||||
├─run_standalone_train.sh # 启动Ascend单机训练(单卡)
|
||||
├─run_distribute_train.sh # 启动Ascend分布式训练(8卡)
|
||||
├─run_standalone_train_for_gpu.sh # 启动GPU单机训练(单卡)
|
||||
├─run_distribute_train_for_gpu.sh # 启动GPU分布式训练(8卡)
|
||||
├─run_infer_310.sh # 启动Ascend310推理
|
||||
└─run_eval.sh # 启动评估
|
||||
├─src
|
||||
├─backbone
|
||||
├─_init_.py # 初始化
|
||||
├─resnext.py # ResNeXt101骨干
|
||||
├─utils
|
||||
├─_init_.py # 初始化
|
||||
├─cunstom_op.py # 网络操作
|
||||
├─logging.py # 打印日志
|
||||
├─optimizers_init_.py # 获取参数
|
||||
├─sampler.py # 分布式采样器
|
||||
├─var_init_.py # 计算增益值
|
||||
├─_init_.py # 初始化
|
||||
├─config.py # 参数配置
|
||||
├─crossentropy.py # 交叉熵损失函数
|
||||
├─dataset.py # 数据预处理
|
||||
├─head.py # 常见头
|
||||
├─image_classification.py # 获取ResNet
|
||||
├─linear_warmup.py # 线性热身学习率
|
||||
├─warmup_cosine_annealing.py # 每次迭代的学习率
|
||||
├─warmup_step_lr.py # 热身迭代学习率
|
||||
├─model_utils
|
||||
│ ├──config.py # 参数配置
|
||||
│ ├──device_adapter.py # 设备配置
|
||||
│ ├──local_adapter.py # 本地设备配置
|
||||
│ ├──moxing_adapter.py # modelarts设备配置
|
||||
├──create_imagenet2012_label.py # 转换推理数据
|
||||
├──default_config.yaml # 参数配置
|
||||
├──eval.py # 评估网络
|
||||
├──export.py # 转换ckpt至MINDIR格式
|
||||
├──postprogress.py # 310推理后处理
|
||||
├──train.py # 训练网络
|
||||
├──mindspore_hub_conf.py # MindSpore Hub接口
|
||||
```
|
||||
|
||||
### 1. 仓库克隆
|
||||
|
||||
```shell
|
||||
git clone https://gitee.com/neoming/resnext101-64x4d-mindspore.git
|
||||
cd resnext101-64x4d-mindspore/
|
||||
```
|
||||
|
||||
### 2. 数据下载和预处理
|
||||
|
||||
1. 下载ImageNet数据集
|
||||
2. 解压训练数据集和验证数据
|
||||
3. 训练和验证图像分别位于train /和val /目录下。 一个文件夹中的所有图像都具有相同的标签。
|
||||
|
||||
### 3. 训练(单卡)
|
||||
|
||||
可以通过python脚本开始训练:
|
||||
|
||||
```shell
|
||||
python train.py --data_path ~/imagenet/train/ --device_target Ascend --run_distribute=True
|
||||
```
|
||||
|
||||
或通过shell脚本开始训练:
|
||||
|
||||
```shell
|
||||
Ascend:
|
||||
# 分布式训练示例(8卡)
|
||||
bash scripts/run_distribute_train.sh RANK_TABLE_FILE DATA_PATH
|
||||
# 单机训练
|
||||
bash scripts/run_standalone_train.sh DEVICE_ID DATA_PATH
|
||||
GPU:
|
||||
# 分布式训练示例(8卡)
|
||||
bash scripts/run_distribute_train_for_gpu.sh DATA_PATH
|
||||
# 单机训练
|
||||
bash scripts/run_standalone_train_for_gpu.sh DEVICE_ID DATA_PATH
|
||||
```
|
||||
|
||||
### 4. 测试
|
||||
|
||||
您可以通过python脚本开始验证:
|
||||
|
||||
```shell
|
||||
python eval.py --data_path ~/imagenet/val/ --platform Ascend --checkpoint_file_path resnext.ckpt
|
||||
```
|
||||
|
||||
或通过shell脚本开始验证:
|
||||
|
||||
```shell
|
||||
# 评估
|
||||
bash scripts/run_eval.sh DEVICE_ID DATA_PATH CHECKPOINT_FILE_PATH DEVICE_TARGET
|
||||
```
|
||||
|
||||
## [推理过程](#contents)
|
||||
|
||||
### 用法
|
||||
|
||||
在执行推理之前,需要通过export.py导出mindir文件。
|
||||
目前仅可处理batch_Size为1。
|
||||
|
||||
## 模型导出
|
||||
|
||||
```shell
|
||||
python export.py --device_target [PLATFORM] --checkpoint_file_path [CKPT_PATH] --file_format [EXPORT_FORMAT]
|
||||
```
|
||||
|
||||
`checkpoint_file_path` 参数为必填项
|
||||
`device_target` 可选 ["Ascend", "GPU"]
|
||||
`file_format` 可选 ["AIR", "MINDIR"]
|
||||
|
||||
```shell
|
||||
#Ascend310 推理
|
||||
bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID]
|
||||
```
|
||||
|
||||
`DEVICE_ID` 可选,默认值为 0。
|
||||
|
||||
### 结果
|
||||
|
||||
推理结果保存在当前路径,可在acc.log中看到最终精度结果。
|
||||
|
||||
```log
|
||||
Total data:50000, top1 accuracy:0.79858, top5 accuracy:0.94716
|
||||
```
|
||||
|
||||
## 高级设置
|
||||
|
||||
### 超参设置
|
||||
|
||||
通过`src/default_config.yaml`文件进行设置,下面是ImageNet单卡实验的设置
|
||||
|
||||
```python
|
||||
"image_size": '224,224',
|
||||
"num_classes": 1000,
|
||||
|
||||
"lr": 0.05,
|
||||
"lr_scheduler": 'cosine_annealing',
|
||||
"lr_epochs": '30,60,90,120',
|
||||
"lr_gamma": 0.1,
|
||||
"eta_min": 0,
|
||||
"T_max": 150,
|
||||
"max_epoch": 150,
|
||||
"backbone": 'resnext101',
|
||||
"warmup_epochs": 1,
|
||||
|
||||
"weight_decay": 0.0001,
|
||||
"momentum": 0.9,
|
||||
"is_dynamic_loss_scale": 0,
|
||||
"loss_scale": 1024,
|
||||
"label_smooth": 1,
|
||||
"label_smooth_factor": 0.1,
|
||||
|
||||
"ckpt_interval": 1250,
|
||||
"ckpt_path": 'outputs/',
|
||||
"is_save_on_master": 1,
|
||||
|
||||
"rank": 0,
|
||||
"group_size": 1
|
||||
```
|
||||
|
||||
### 训练过程
|
||||
|
||||
训练脚本将会存储:
|
||||
|
||||
- checkpoints.
|
||||
- log.
|
||||
|
||||
## 性能
|
||||
|
||||
### 结果
|
||||
|
||||
通过运行训练脚本获得了以下结果。 要获得相同的结果,请遵循快速入门指南中的步骤。
|
||||
|
||||
#### 准确度
|
||||
|
||||
| **epochs** | Top1/Top5 |
|
||||
| :--------: | :-----------: |
|
||||
| 150 | 79.56%(TOP1)/94.68%(TOP5) |
|
||||
|
||||
#### 训练性能结果
|
||||
|
||||
| **NPUs** | train performance |
|
||||
| :------: | :---------------: |
|
||||
| 1 | 196.33image/sec |
|
||||
|
||||
### 310 推理性能
|
||||
|
||||
#### ResNeXt101 on ImageNet
|
||||
|
||||
| Parameters | Ascend |
|
||||
| ------------------- | --------------------------- |
|
||||
| Model Version | ResNeXt101 |
|
||||
| Resource | Ascend 310; OS Euler2.8 |
|
||||
| Uploaded Date       | 06/22/2021 (month/day/year) |
|
||||
| MindSpore Version | 1.2.0 |
|
||||
| Dataset | ImageNet |
|
||||
| batch_size | 1 |
|
||||
| outputs | Accuracy |
|
||||
| Accuracy | TOP1: 79.85%, TOP5: 94.71% |
|
|
@ -1,14 +0,0 @@
|
|||
cmake_minimum_required(VERSION 3.14.1)
|
||||
project(Ascend310Infer)
|
||||
add_compile_definitions(_GLIBCXX_USE_CXX11_ABI=0)
|
||||
set(CMAKE_CXX_FLAGS "${CMAKE_CXX_FLAGS} -O0 -g -std=c++17 -Werror -Wall -fPIE -Wl,--allow-shlib-undefined")
|
||||
set(PROJECT_SRC_ROOT ${CMAKE_CURRENT_LIST_DIR}/)
|
||||
option(MINDSPORE_PATH "mindspore install path" "")
|
||||
include_directories(${MINDSPORE_PATH})
|
||||
include_directories(${MINDSPORE_PATH}/include)
|
||||
include_directories(${PROJECT_SRC_ROOT})
|
||||
find_library(MS_LIB libmindspore.so ${MINDSPORE_PATH}/lib)
|
||||
file(GLOB_RECURSE MD_LIB ${MINDSPORE_PATH}/_c_dataengine*)
|
||||
|
||||
add_executable(main src/main.cc src/utils.cc)
|
||||
target_link_libraries(main ${MS_LIB} ${MD_LIB} gflags)
|
|
@ -1,33 +0,0 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#ifndef MINDSPORE_INFERENCE_UTILS_H_
|
||||
#define MINDSPORE_INFERENCE_UTILS_H_
|
||||
|
||||
#include <sys/stat.h>
|
||||
#include <dirent.h>
|
||||
#include <vector>
|
||||
#include <string>
|
||||
#include <memory>
|
||||
#include "include/api/types.h"
|
||||
|
||||
DIR *OpenDir(std::string_view dirName);
|
||||
std::string RealPath(std::string_view path);
|
||||
mindspore::MSTensor ReadFileToTensor(const std::string &file);
|
||||
int WriteResult(const std::string& imageFile, const std::vector<mindspore::MSTensor> &outputs);
|
||||
std::vector<std::string> GetAllFiles(std::string dir_name);
|
||||
|
||||
#endif
|
|
@ -1,157 +0,0 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <sys/time.h>
|
||||
#include <gflags/gflags.h>
|
||||
#include <dirent.h>
|
||||
#include <iostream>
|
||||
#include <string>
|
||||
#include <algorithm>
|
||||
#include <iosfwd>
|
||||
#include <vector>
|
||||
#include <fstream>
|
||||
|
||||
#include "../inc/utils.h"
|
||||
#include "include/dataset/execute.h"
|
||||
#include "include/dataset/transforms.h"
|
||||
#include "include/dataset/vision.h"
|
||||
#include "include/dataset/vision_ascend.h"
|
||||
#include "include/api/types.h"
|
||||
#include "include/api/model.h"
|
||||
#include "include/api/serialization.h"
|
||||
#include "include/api/context.h"
|
||||
|
||||
using mindspore::Serialization;
|
||||
using mindspore::Model;
|
||||
using mindspore::Context;
|
||||
using mindspore::Status;
|
||||
using mindspore::ModelType;
|
||||
using mindspore::Graph;
|
||||
using mindspore::GraphCell;
|
||||
using mindspore::kSuccess;
|
||||
using mindspore::MSTensor;
|
||||
using mindspore::DataType;
|
||||
using mindspore::dataset::Execute;
|
||||
using mindspore::dataset::TensorTransform;
|
||||
using mindspore::dataset::vision::Decode;
|
||||
using mindspore::dataset::vision::Resize;
|
||||
using mindspore::dataset::vision::CenterCrop;
|
||||
using mindspore::dataset::vision::Normalize;
|
||||
using mindspore::dataset::vision::HWC2CHW;
|
||||
|
||||
DEFINE_string(model_path, "", "model path");
|
||||
DEFINE_string(dataset, "ImageNet", "dataset: ImageNet");
|
||||
DEFINE_string(dataset_path, ".", "dataset path");
|
||||
DEFINE_int32(device_id, 0, "device id");
|
||||
|
||||
int main(int argc, char **argv) {
|
||||
gflags::ParseCommandLineFlags(&argc, &argv, true);
|
||||
if (RealPath(FLAGS_model_path).empty()) {
|
||||
std::cout << "Invalid model" << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::transform(FLAGS_dataset.begin(), FLAGS_dataset.end(), FLAGS_dataset.begin(), ::tolower);
|
||||
|
||||
auto context = std::make_shared<Context>();
|
||||
auto ascend310_info = std::make_shared<mindspore::Ascend310DeviceInfo>();
|
||||
ascend310_info->SetDeviceID(FLAGS_device_id);
|
||||
context->MutableDeviceInfo().push_back(ascend310_info);
|
||||
|
||||
Graph graph;
|
||||
Status ret = Serialization::Load(FLAGS_model_path, ModelType::kMindIR, &graph);
|
||||
if (ret != kSuccess) {
|
||||
std::cout << "Load model failed." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
Model model;
|
||||
ret = model.Build(GraphCell(graph), context);
|
||||
if (ret != kSuccess) {
|
||||
std::cout << "ERROR: Build failed." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::vector<MSTensor> modelInputs = model.GetInputs();
|
||||
|
||||
auto all_files = GetAllFiles(FLAGS_dataset_path);
|
||||
if (all_files.empty()) {
|
||||
std::cout << "ERROR: no input data." << std::endl;
|
||||
return 1;
|
||||
}
|
||||
|
||||
std::shared_ptr<TensorTransform> decode(new Decode());
|
||||
std::shared_ptr<TensorTransform> resize(new Resize({256, 256}));
|
||||
std::shared_ptr<TensorTransform> centerCrop(new CenterCrop({224, 224}));
|
||||
std::shared_ptr<TensorTransform> normImageNet(new Normalize({123.675, 116.28, 103.53}, {58.395, 57.12, 57.375}));
|
||||
std::shared_ptr<TensorTransform> hwc2chw(new HWC2CHW());
|
||||
|
||||
mindspore::dataset::Execute transformImageNet({decode, resize, centerCrop, normImageNet, hwc2chw});
|
||||
|
||||
std::map<double, double> costTime_map;
|
||||
|
||||
size_t size = all_files.size();
|
||||
for (size_t i = 0; i < size; ++i) {
|
||||
struct timeval start;
|
||||
struct timeval end;
|
||||
double startTime_ms;
|
||||
double endTime_ms;
|
||||
std::vector<MSTensor> inputs;
|
||||
std::vector<MSTensor> outputs;
|
||||
|
||||
std::cout << "Start predict input files:" << all_files[i] << std::endl;
|
||||
mindspore::MSTensor image = ReadFileToTensor(all_files[i]);
|
||||
|
||||
if (FLAGS_dataset.compare("imagenet") == 0) {
|
||||
transformImageNet(image, &image);
|
||||
} else {
|
||||
std::cout << "unsupported dataset ...";
|
||||
return 1;
|
||||
}
|
||||
|
||||
inputs.emplace_back(modelInputs[0].Name(), modelInputs[0].DataType(), modelInputs[0].Shape(),
|
||||
image.Data().get(), image.DataSize());
|
||||
|
||||
gettimeofday(&start, NULL);
|
||||
model.Predict(inputs, &outputs);
|
||||
gettimeofday(&end, NULL);
|
||||
|
||||
startTime_ms = (1.0 * start.tv_sec * 1000000 + start.tv_usec) / 1000;
|
||||
endTime_ms = (1.0 * end.tv_sec * 1000000 + end.tv_usec) / 1000;
|
||||
costTime_map.insert(std::pair<double, double>(startTime_ms, endTime_ms));
|
||||
WriteResult(all_files[i], outputs);
|
||||
}
|
||||
double average = 0.0;
|
||||
int infer_cnt = 0;
|
||||
char tmpCh[256] = {0};
|
||||
for (auto iter = costTime_map.begin(); iter != costTime_map.end(); iter++) {
|
||||
double diff = 0.0;
|
||||
diff = iter->second - iter->first;
|
||||
average += diff;
|
||||
infer_cnt++;
|
||||
}
|
||||
|
||||
average = average/infer_cnt;
|
||||
|
||||
snprintf(tmpCh, sizeof(tmpCh), "NN inference cost average time: %4.3f ms of infer_count %d\n", average, infer_cnt);
|
||||
std::cout << "NN inference cost average time: "<< average << "ms of infer_count " << infer_cnt << std::endl;
|
||||
std::string file_name = "./time_Result" + std::string("/test_perform_static.txt");
|
||||
std::ofstream file_stream(file_name.c_str(), std::ios::trunc);
|
||||
file_stream << tmpCh;
|
||||
file_stream.close();
|
||||
costTime_map.clear();
|
||||
return 0;
|
||||
}
|
|
@ -1,145 +0,0 @@
|
|||
/**
|
||||
* Copyright 2021 Huawei Technologies Co., Ltd
|
||||
*
|
||||
* Licensed under the Apache License, Version 2.0 (the "License");
|
||||
* you may not use this file except in compliance with the License.
|
||||
* You may obtain a copy of the License at
|
||||
*
|
||||
* http://www.apache.org/licenses/LICENSE-2.0
|
||||
*
|
||||
* Unless required by applicable law or agreed to in writing, software
|
||||
* distributed under the License is distributed on an "AS IS" BASIS,
|
||||
* WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
* See the License for the specific language governing permissions and
|
||||
* limitations under the License.
|
||||
*/
|
||||
|
||||
#include <fstream>
|
||||
#include <algorithm>
|
||||
#include <iostream>
|
||||
#include "inc/utils.h"
|
||||
|
||||
using mindspore::MSTensor;
|
||||
using mindspore::DataType;
|
||||
|
||||
// Collect the regular files found directly under dirName and under each of its
// immediate sub-directories (one level deep only), print them, and return the
// full paths sorted lexicographically.
// Returns an empty vector when dirName cannot be opened.
std::vector<std::string> GetAllFiles(std::string dirName) {
  DIR *dir = OpenDir(dirName);
  if (dir == nullptr) {
    return {};
  }
  std::vector<std::string> dirs;
  std::vector<std::string> files;
  struct dirent *filename = nullptr;
  while ((filename = readdir(dir)) != nullptr) {
    std::string dName = std::string(filename->d_name);
    if (dName == "." || dName == "..") {
      continue;
    } else if (filename->d_type == DT_DIR) {
      dirs.emplace_back(dirName + "/" + dName);
    } else if (filename->d_type == DT_REG) {
      files.emplace_back(dirName + "/" + dName);
    }
  }
  // Release the handle; the original leaked one DIR per opened directory.
  closedir(dir);

  for (const auto &d : dirs) {
    dir = OpenDir(d);
    if (dir == nullptr) {
      // A sub-directory that cannot be opened is skipped instead of handing
      // a null handle to readdir().
      continue;
    }
    while ((filename = readdir(dir)) != nullptr) {
      std::string dName = std::string(filename->d_name);
      if (dName == "." || dName == ".." || filename->d_type != DT_REG) {
        continue;
      }
      files.emplace_back(d + "/" + dName);
    }
    closedir(dir);
  }
  std::sort(files.begin(), files.end());
  for (auto &f : files) {
    std::cout << "image file: " << f << std::endl;
  }
  return files;
}
|
||||
|
||||
int WriteResult(const std::string& imageFile, const std::vector<MSTensor> &outputs) {
|
||||
std::string homePath = "./result_Files";
|
||||
for (size_t i = 0; i < outputs.size(); ++i) {
|
||||
size_t outputSize;
|
||||
std::shared_ptr<const void> netOutput;
|
||||
netOutput = outputs[i].Data();
|
||||
outputSize = outputs[i].DataSize();
|
||||
int pos = imageFile.rfind('/');
|
||||
std::string fileName(imageFile, pos + 1);
|
||||
fileName.replace(fileName.find('.'), fileName.size() - fileName.find('.'), '_' + std::to_string(i) + ".bin");
|
||||
std::string outFileName = homePath + "/" + fileName;
|
||||
FILE *outputFile = fopen(outFileName.c_str(), "wb");
|
||||
fwrite(netOutput.get(), outputSize, sizeof(char), outputFile);
|
||||
fclose(outputFile);
|
||||
outputFile = nullptr;
|
||||
}
|
||||
return 0;
|
||||
}
|
||||
|
||||
// Read the whole file into a one-dimensional uint8 MSTensor named after the
// file path. Returns a default-constructed MSTensor when the path is empty,
// the file cannot be opened, or its size cannot be determined.
mindspore::MSTensor ReadFileToTensor(const std::string &file) {
  if (file.empty()) {
    std::cout << "Pointer file is nullptr" << std::endl;
    return mindspore::MSTensor();
  }

  // The content is raw bytes, so open in binary mode to rule out any
  // platform-specific text translation.
  std::ifstream ifs(file, std::ios::in | std::ios::binary);
  if (!ifs.good()) {
    std::cout << "File: " << file << " is not exist" << std::endl;
    return mindspore::MSTensor();
  }

  if (!ifs.is_open()) {
    std::cout << "File: " << file << "open failed" << std::endl;
    return mindspore::MSTensor();
  }

  ifs.seekg(0, std::ios::end);
  // tellg() reports -1 on failure; converting that to size_t would request a
  // gigantic buffer, so reject it explicitly.
  const std::streamoff length = ifs.tellg();
  if (length < 0) {
    std::cout << "File: " << file << " get size failed" << std::endl;
    return mindspore::MSTensor();
  }
  size_t size = static_cast<size_t>(length);
  mindspore::MSTensor buffer(file, mindspore::DataType::kNumberTypeUInt8, {static_cast<int64_t>(size)}, nullptr, size);

  ifs.seekg(0, std::ios::beg);
  ifs.read(reinterpret_cast<char *>(buffer.MutableData()), size);
  ifs.close();

  return buffer;
}
|
||||
|
||||
|
||||
DIR *OpenDir(std::string_view dirName) {
|
||||
if (dirName.empty()) {
|
||||
std::cout << " dirName is null ! " << std::endl;
|
||||
return nullptr;
|
||||
}
|
||||
std::string realPath = RealPath(dirName);
|
||||
struct stat s;
|
||||
lstat(realPath.c_str(), &s);
|
||||
if (!S_ISDIR(s.st_mode)) {
|
||||
std::cout << "dirName is not a valid directory !" << std::endl;
|
||||
return nullptr;
|
||||
}
|
||||
DIR *dir;
|
||||
dir = opendir(realPath.c_str());
|
||||
if (dir == nullptr) {
|
||||
std::cout << "Can not open dir " << dirName << std::endl;
|
||||
return nullptr;
|
||||
}
|
||||
std::cout << "Successfully opened the dir " << dirName << std::endl;
|
||||
return dir;
|
||||
}
|
||||
|
||||
// Return the canonical absolute form of path as computed by realpath(3), or
// an empty string (after printing a diagnostic) when it cannot be resolved.
std::string RealPath(std::string_view path) {
  char realPathMem[PATH_MAX] = {0};
  // string_view::data() is not guaranteed to be NUL-terminated, so copy into a
  // std::string before handing the path to the C API.
  const std::string pathStr(path);
  if (realpath(pathStr.c_str(), realPathMem) == nullptr) {
    std::cout << "File: " << path << " is not exist." << std::endl;
    return "";
  }

  std::string realPath(realPathMem);
  std::cout << path << " realpath is: " << realPath << std::endl;
  return realPath;
}
|
|
@ -1,70 +0,0 @@
|
|||
# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
|
||||
enable_modelarts: False
|
||||
# Url for modelarts
|
||||
data_url: ""
|
||||
train_url: ""
|
||||
checkpoint_url: ""
|
||||
# Path for local
|
||||
run_distribute: False
|
||||
enable_profiling: False
|
||||
data_path: "/cache/data"
|
||||
output_path: "/cache/train"
|
||||
load_path: "/cache/checkpoint_path/"
|
||||
device_target: 'Ascend'
|
||||
checkpoint_path: './checkpoint/'
|
||||
checkpoint_file_path: ''
|
||||
|
||||
# ==============================================================================
|
||||
# Training options
|
||||
image_size: [224,224]
|
||||
num_classes: 1000
|
||||
batch_size: 1
|
||||
|
||||
lr: 0.4
|
||||
lr_scheduler: 'cosine_annealing'
|
||||
lr_epochs: [30,60,90,120]
|
||||
lr_gamma: 0.1
|
||||
eta_min: 0
|
||||
T_max: 150
|
||||
max_epoch: 150
|
||||
warmup_epochs: 1
|
||||
|
||||
weight_decay: 0.0001
|
||||
momentum: 0.9
|
||||
is_dynamic_loss_scale: 0
|
||||
loss_scale: 1024
|
||||
label_smooth: 1
|
||||
label_smooth_factor: 0.1
|
||||
per_batch_size: 128
|
||||
|
||||
ckpt_interval: 5
|
||||
ckpt_save_max: 5
|
||||
is_save_on_master: 1
|
||||
rank_save_ckpt_flag: 0
|
||||
outputs_dir: ""
|
||||
log_path: './output_log'
|
||||
|
||||
# Export options
|
||||
device_id: 0
|
||||
width: 224
|
||||
height: 224
|
||||
file_name: "resnext101"
|
||||
file_format: "MINDIR"
|
||||
|
||||
---
|
||||
# Help description for each configuration
|
||||
enable_modelarts: 'Whether training on modelarts, default: False'
|
||||
data_url: 'Dataset url for obs'
|
||||
train_url: 'Training output url for obs'
|
||||
checkpoint_url: 'The location of checkpoint for obs'
|
||||
data_path: 'Dataset path for local'
|
||||
output_path: 'Training output path for local'
|
||||
load_path: 'The location of checkpoint for local'
|
||||
device_target: 'Target device type, available: [Ascend, GPU, CPU]'
|
||||
enable_profiling: 'Whether enable profiling while training, default: False'
|
||||
num_classes: 'Class for dataset'
|
||||
batch_size: "Batch size for training and evaluation"
|
||||
epoch_size: "Total training epochs."
|
||||
keep_checkpoint_max: "keep the last keep_checkpoint_max checkpoint"
|
||||
checkpoint_path: "The location of the checkpoint file."
|
||||
checkpoint_file_path: "The location of the checkpoint file."
|
|
@ -1,211 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Eval"""
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
import glob
|
||||
import numpy as np
|
||||
import mindspore.nn as nn
|
||||
|
||||
from mindspore import Tensor, context
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.communication.management import init, get_rank, get_group_size, release
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.ops import functional as F
|
||||
from mindspore.common import dtype as mstype
|
||||
|
||||
from src.utils.logging import get_logger
|
||||
from src.utils.auto_mixed_precision import auto_mixed_precision
|
||||
from src.utils.var_init import load_pretrain_model
|
||||
from src.image_classification import get_network
|
||||
from src.dataset import classification_dataset
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
|
||||
|
||||
class ParameterReduce(nn.Cell):
    """Cell that passes its input through ``P.AllReduce``.

    The input is first multiplied by a float32 scalar ``1.0`` and the product
    is then all-reduced.
    """

    def __init__(self):
        super().__init__()
        self.reduce = P.AllReduce()
        self.cast = P.Cast()

    def construct(self, x):
        """Return the all-reduced value of ``x * 1.0``."""
        unit = self.cast(F.scalar_to_array(1.0), mstype.float32)
        return self.reduce(x * unit)
|
||||
|
||||
|
||||
def set_parameters():
    """Fill in rank, group size, output directory and logger on ``config``.

    When ``config.run_distribute`` is set, communication is initialised for
    Ascend or GPU and the rank and group size are queried; otherwise rank 0 of
    a group of 1 is used. Returns the updated ``config`` object.
    """
    if not config.run_distribute:
        config.rank, config.group_size = 0, 1
    else:
        if config.device_target == "Ascend":
            init()
        elif config.device_target == "GPU":
            init("nccl")
        config.rank = get_rank()
        config.group_size = get_group_size()

    # Each run logs into its own timestamped sub-directory of config.log_path.
    timestamp = datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')
    config.outputs_dir = os.path.join(config.log_path, timestamp)

    config.logger = get_logger(config.outputs_dir, config.rank)
    return config
|
||||
|
||||
|
||||
def get_top5_acc(top5_arg, gt_class):
    """Count the samples whose ground-truth label appears in their candidate list.

    ``top5_arg`` and ``gt_class`` are iterated in lockstep; the return value is
    the number of pairs for which ``gt in top5`` holds.
    """
    return sum(1 for candidates, label in zip(top5_arg, gt_class) if label in candidates)
|
||||
|
||||
|
||||
def get_result(model, top1_correct, top5_correct, img_tot):
    """Return top-1 hits, top-5 hits and image count as a ``(3, 1)`` array.

    In single-process mode the three inputs are simply wrapped in an array.
    When ``config.run_distribute`` is set, each rank writes its three values to
    ``.npy`` files under ``/cache``, waits until the files of every rank in
    ``config.group_size`` exist, and returns the sums over all ranks.

    Args:
        model: Checkpoint path; with ``/`` removed it tags the exchange files.
        top1_correct: Number of top-1 hits counted by this process.
        top5_correct: Number of top-5 hits counted by this process.
        img_tot: Number of images counted by this process.
    """
    results = [[top1_correct], [top5_correct], [img_tot]]
    config.logger.info('before results=%s', results)
    if config.run_distribute:
        model_md5 = model.replace('/', '')
        tmp_dir = '/cache'
        # exist_ok avoids the exists()/mkdir() race between ranks that start
        # at the same time.
        os.makedirs(tmp_dir, exist_ok=True)

        def rank_files(rank):
            """Return the (top1, top5, img_tot) exchange file paths of ``rank``."""
            return ['/cache/{}_rank_{}_{}.npy'.format(kind, rank, model_md5)
                    for kind in ('top1', 'top5', 'img_tot')]

        # Write to a side file and rename, so a peer polling for the final
        # name never loads a partially written array.
        for path, value in zip(rank_files(config.rank), (top1_correct, top5_correct, img_tot)):
            partial = path[:-len('.npy')] + '.partial.npy'
            np.save(partial, value)
            os.replace(partial, path)

        # Poll until every rank has published its files; sleep between checks
        # rather than spinning a CPU core.
        expected = [path for rank in range(config.group_size) for path in rank_files(rank)]
        while not all(os.path.exists(path) for path in expected):
            time.sleep(0.1)

        totals = [0, 0, 0]
        for rank in range(config.group_size):
            for idx, path in enumerate(rank_files(rank)):
                totals[idx] += np.load(path)
        results = np.array([[total] for total in totals])
    else:
        results = np.array(results)

    config.logger.info('after results=%s', results)
    return results
|
||||
|
||||
@moxing_wrapper()
def test():
    """Evaluate one or more checkpoints and log top-1/top-5 accuracy.

    ``config.checkpoint_file_path`` may be a single checkpoint file or a
    directory; for a directory every ``*.ckpt`` file inside it is evaluated.
    """
    set_parameters()
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=config.device_target, save_graphs=False)
    # Only honour DEVICE_ID when the environment variable holds a number.
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=int(os.getenv('DEVICE_ID')))

    # init distributed
    if config.run_distribute:
        parallel_mode = ParallelMode.DATA_PARALLEL
        context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.group_size,
                                          gradients_mean=True)

    config.logger.save_args(config)

    # network
    config.logger.important_info('start create network')
    if os.path.isdir(config.checkpoint_file_path):
        models = list(glob.glob(os.path.join(config.checkpoint_file_path, '*.ckpt')))
        print(models)
        # Sort key: the negated integer parsed from the checkpoint file name,
        # so the checkpoint with the largest number is evaluated first. The
        # two branches differ only in where that integer sits in the name.
        if config.graph_ckpt:
            f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('-')[-1].split('_')[0])
        else:
            f = lambda x: -1 * int(os.path.splitext(os.path.split(x)[-1])[0].split('_')[-1])
        config.models = sorted(models, key=f)
    else:
        config.models = [config.checkpoint_file_path,]

    for model in config.models:
        # A fresh dataset iterator and network are built for every checkpoint.
        de_dataset = classification_dataset(config.data_path, image_size=config.image_size,
                                            per_batch_size=config.per_batch_size,
                                            max_epoch=1, rank=config.rank, group_size=config.group_size,
                                            mode='eval')
        eval_dataloader = de_dataset.create_tuple_iterator(output_numpy=True, num_epochs=1)
        network = get_network(num_classes=config.num_classes, platform=config.device_target)

        load_pretrain_model(model, network, config)

        img_tot = 0
        top1_correct = 0
        top5_correct = 0
        if config.device_target == "Ascend":
            network.to_float(mstype.float16)
        else:
            auto_mixed_precision(network)
        network.set_train(False)
        t_end = time.time()
        it = 0
        for data, gt_classes in eval_dataloader:
            output = network(Tensor(data, mstype.float32))
            output = output.asnumpy()

            top1_output = np.argmax(output, (-1))
            # argsort is ascending, so the last five columns are the five
            # highest-scoring classes.
            top5_output = np.argsort(output)[:, -5:]

            t1_correct = np.equal(top1_output, gt_classes).sum()
            top1_correct += t1_correct
            top5_correct += get_top5_acc(top5_output, gt_classes)
            # NOTE(review): a full per_batch_size is added for every batch,
            # including a possibly smaller final one - confirm the dataset
            # drops or pads the remainder.
            img_tot += config.per_batch_size

            # On rank 0 the timer is restarted after the first batch, so that
            # batch is excluded from the throughput figure below.
            if config.rank == 0 and it == 0:
                t_end = time.time()
                it = 1
        if config.rank == 0:
            time_used = time.time() - t_end
            # The first batch is subtracted because timing started after it.
            fps = (img_tot - config.per_batch_size) * config.group_size / time_used
            config.logger.info('Inference Performance: {:.2f} img/sec'.format(fps))
        results = get_result(model, top1_correct, top5_correct, img_tot)
        # Rows of results: top-1 hits, top-5 hits, image total.
        top1_correct = results[0, 0]
        top5_correct = results[1, 0]
        img_tot = results[2, 0]
        acc1 = 100.0 * top1_correct / img_tot
        acc5 = 100.0 * top5_correct / img_tot
        config.logger.info('after allreduce eval: top1_correct={}, tot={},'
                           'acc={:.2f}%(TOP1)'.format(top1_correct, img_tot, acc1))
        config.logger.info('after allreduce eval: top5_correct={}, tot={},'
                           'acc={:.2f}%(TOP5)'.format(top5_correct, img_tot, acc5))
    if config.run_distribute:
        release()


if __name__ == "__main__":
    test()
|
|
@ -1,47 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
resnext export mindir.
|
||||
"""
|
||||
import argparse
|
||||
import numpy as np
|
||||
from mindspore import context, Tensor, load_checkpoint, load_param_into_net, export
|
||||
from src.model_utils.config import config
|
||||
from src.image_classification import get_network
|
||||
|
||||
# NOTE(review): these command-line options are parsed here, but the code below
# reads only args.checkpoint_file_path from them; device, batch size, input
# size, file name and file format are all taken from `config` instead.
parser = argparse.ArgumentParser(description='checkpoint export')
parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--batch_size", type=int, default=1, help="batch size")
parser.add_argument("--checkpoint_file_path", type=str, required=True, help="Checkpoint file path.")
parser.add_argument('--width', type=int, default=224, help='input width')
parser.add_argument('--height', type=int, default=224, help='input height')
parser.add_argument("--file_name", type=str, default="resnext101", help="output file name.")
parser.add_argument("--file_format", type=str, choices=["AIR", "MINDIR"], default="MINDIR", help="file format")
parser.add_argument("--device_target", type=str, default="Ascend",
                    choices=["Ascend", "GPU", "CPU"], help="device target (default: Ascend)")
args = parser.parse_args()

# Graph mode on the configured device; the device id is only set for Ascend.
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
if config.device_target == "Ascend":
    context.set_context(device_id=config.device_id)

if __name__ == '__main__':
    net = get_network(num_classes=config.num_classes, platform=config.device_target)

    # Load the trained weights into the freshly built network.
    param_dict = load_checkpoint(args.checkpoint_file_path)
    load_param_into_net(net, param_dict)
    # A random float32 tensor with layout [batch, 3, height, width] is passed
    # to export() as the example input.
    input_shp = [config.batch_size, 3, config.height, config.width]
    input_array = Tensor(np.random.uniform(-1.0, 1.0, size=input_shp).astype(np.float32))
    export(net, input_array, file_name=config.file_name, file_format=config.file_format)
|
|
@ -1,48 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""post process for 310 inference"""
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
import numpy as np
|
||||
|
||||
batch_size = 1
# Command-line interface: both paths are required.
parser = argparse.ArgumentParser(description="resnext inference post-processing")
parser.add_argument("--result_path", type=str, required=True, help="result files path.")
# The label file is read with json.load, so describe it as a JSON label file
# rather than an image path.
parser.add_argument("--label_path", type=str, required=True, help="label json file path.")
args = parser.parse_args()
|
||||
|
||||
def get_result(result_path, label_path):
    """Print top-1 and top-5 accuracy for the inference outputs in ``result_path``.

    Every ``<image>_0.bin`` file is read as 1000 float16 scores and compared
    with the label stored under ``<image>.JPEG`` in the JSON file at
    ``label_path``. Files without the ``_0.bin`` suffix are ignored. When no
    result file is found a message is printed instead of dividing by zero.

    Args:
        result_path: Directory holding the ``*_0.bin`` output files.
        label_path: JSON file mapping image file names to class indices.
    """
    suffix = "_0.bin"
    files = [name for name in os.listdir(result_path) if name.endswith(suffix)]
    with open(label_path, "r") as label:
        labels = json.load(label)

    total_data = len(files)
    if total_data == 0:
        print(f"No '*{suffix}' result files found in {result_path}.")
        return

    top1 = 0
    top5 = 0
    for file in files:
        img_ids_name = file[:-len(suffix)]
        data_path = os.path.join(result_path, file)
        result = np.fromfile(data_path, dtype=np.float16).reshape(1, 1000)  # reshape(batch_size, num_classes)
        # Negate the scores so argsort yields class indices from best to worst.
        predict = np.argsort(-result[0], axis=-1)
        expected = labels[img_ids_name + ".JPEG"]
        if expected == predict[0]:
            top1 += 1
        if expected in predict[:5]:
            top5 += 1

    print(f"Total data: {total_data}, top1 accuracy: {top1/total_data}, top5 accuracy: {top5/total_data}.")
|
||||
|
||||
# Script entry point: score the results named on the command line.
if __name__ == '__main__':
    get_result(args.result_path, args.label_path)
|
|
@ -1,58 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Launch 8-rank distributed training, one background process per device.
# Positional arguments: $1 rank table file, $2 dataset directory,
# $3 (optional) checkpoint path.
DATA_DIR=$2
export RANK_TABLE_FILE=$1
export RANK_SIZE=8
export HCCL_CONNECT_TIMEOUT=600
echo "hccl connect time out has changed to 600 second"
PATH_CHECKPOINT=""
if [ $# -eq 3 ]
then
    PATH_CHECKPOINT=$3
fi

# Split the logical CPU cores evenly between ranks so each training process
# can be pinned to its own core range with taskset.
cores=$(grep -c "processor" /proc/cpuinfo)
echo "the number of logical core" $cores
avg_core_per_rank=$((cores / RANK_SIZE))
core_gap=$((avg_core_per_rank - 1))
echo "avg_core_per_rank" $avg_core_per_rank
echo "core_gap" $core_gap
for((i=0;i<RANK_SIZE;i++))
do
    start=$((i * avg_core_per_rank))
    export DEVICE_ID=$i
    export RANK_ID=$i
    export DEPLOY_MODE=0
    export GE_USE_STATIC_MEMORY=1
    end=$((start + core_gap))
    cmdopt=$start"-"$end

    # Every rank runs from its own fresh LOG<i> working directory.
    rm -rf LOG$i
    mkdir ./LOG$i
    cp *.py ./LOG$i
    cd ./LOG$i || exit
    echo "start training for rank $i, device $DEVICE_ID"

    env > env.log
    # Paths are quoted so values containing spaces reach train.py intact.
    taskset -c $cmdopt python ../train.py \
      --run_distribute=True \
      --device_id=$DEVICE_ID \
      --checkpoint_file_path="$PATH_CHECKPOINT" \
      --data_path="$DATA_DIR" \
      --output_path './output' > log.txt 2>&1 &
    cd ../
done
|
|
@ -1,31 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Launch 8-process distributed GPU training through mpirun, in the background.
# Positional arguments: $1 dataset directory, $2 (optional) checkpoint path.
DATA_DIR=$1
export RANK_SIZE=8
PATH_CHECKPOINT=""
if [ $# -eq 2 ]
then
    PATH_CHECKPOINT=$2
fi

# Paths are quoted so values containing spaces reach train.py intact.
mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
    python train.py \
    --run_distribute=True \
    --device_target="GPU" \
    --checkpoint_file_path="$PATH_CHECKPOINT" \
    --data_path="$DATA_DIR" \
    --output_path './output' > log.txt 2>&1 &
|
|
@ -1,29 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Run eval.py in the background, logging to log.txt.
# Positional arguments: $1 device id, $2 dataset directory, $3 checkpoint
# path, $4 (optional) platform, which defaults to Ascend.
export DEVICE_ID=$1
DATA_DIR=$2
PATH_CHECKPOINT=$3
PLATFORM=Ascend
if [ $# -eq 4 ]
then
    PLATFORM=$4
fi

# Paths are quoted so values containing spaces reach eval.py intact.
python eval.py \
    --checkpoint_file_path="$PATH_CHECKPOINT" \
    --device_target="$PLATFORM" \
    --data_path="$DATA_DIR" > log.txt 2>&1 &
|
|
@ -1,99 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Require two or three positional arguments; otherwise print usage and abort.
if [[ $# -lt 2 || $# -gt 3 ]]; then
    echo "Usage: bash run_infer_310.sh [MINDIR_PATH] [DATA_PATH] [DEVICE_ID]
DEVICE_ID is optional, it can be set by environment variable device_id, otherwise the value is zero"
exit 1
fi
|
||||
|
||||
# Print $1 unchanged when it is already absolute; otherwise print it resolved
# against the current directory (realpath -m does not require it to exist).
get_real_path(){
  if [ "${1:0:1}" == "/" ]; then
    echo "$1"
  else
    # Quoted so a path containing spaces is passed as one argument.
    realpath -m "$PWD/$1"
  fi
}
|
||||
# Resolve the positional arguments; quoting keeps paths with spaces intact.
model=$(get_real_path "$1")
data_path=$(get_real_path "$2")

device_id=0
if [ $# == 3 ]; then
    device_id=$3
fi

echo "mindir name: "$model
echo "dataset path: "$data_path
echo "device id: "$device_id

# Two environment layouts are supported, chosen by whether the
# ascend-toolkit directory exists under ASCEND_HOME.
export ASCEND_HOME=/usr/local/Ascend/
if [ -d ${ASCEND_HOME}/ascend-toolkit ]; then
    export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/ascend-toolkit/latest/atc/bin:$PATH
    export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/ascend-toolkit/latest/atc/lib64:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
    export TBE_IMPL_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp/op_impl/built-in/ai_core/tbe
    export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:${TBE_IMPL_PATH}:$ASCEND_HOME/ascend-toolkit/latest/fwkacllib/python/site-packages:$PYTHONPATH
    export ASCEND_OPP_PATH=$ASCEND_HOME/ascend-toolkit/latest/opp
else
    export PATH=$ASCEND_HOME/fwkacllib/bin:$ASCEND_HOME/fwkacllib/ccec_compiler/bin:$ASCEND_HOME/atc/ccec_compiler/bin:$ASCEND_HOME/atc/bin:$PATH
    export LD_LIBRARY_PATH=$ASCEND_HOME/fwkacllib/lib64:/usr/local/lib:$ASCEND_HOME/atc/lib64:$ASCEND_HOME/acllib/lib64:$ASCEND_HOME/driver/lib64:$ASCEND_HOME/add-ons:$LD_LIBRARY_PATH
    export PYTHONPATH=$ASCEND_HOME/fwkacllib/python/site-packages:$ASCEND_HOME/atc/python/site-packages:$PYTHONPATH
    export ASCEND_OPP_PATH=$ASCEND_HOME/opp
fi
|
||||
|
||||
# Build the inference binary in ../ascend310_infer, logging to build.log.
# The function's exit status is that of build.sh.
function compile_app()
{
    cd ../ascend310_infer/ || exit
    # Clean a previous build when a Makefile is present.
    [ -f "Makefile" ] && make clean
    bash build.sh > build.log 2>&1
}
|
||||
|
||||
# Return to the previous directory, recreate the result and timing
# directories, and run the inference binary with output sent to infer.log.
function infer()
{
    cd - || exit
    if [ -d result_Files ]; then
        rm -rf ./result_Files
    fi
    if [ -d time_Result ]; then
        rm -rf ./time_Result
    fi
    mkdir result_Files
    mkdir time_Result
    # Quoted so model and dataset paths containing spaces stay one argument.
    ../ascend310_infer/main --model_path="$model" --dataset_path="$data_path" --device_id="$device_id" &> infer.log
}
|
||||
|
||||
# Generate the label file and compute accuracy into acc.log.
# Both steps run in the foreground so the function's exit status reflects
# their success; the original backgrounded postprocess.py, which made the
# caller's "$? -ne 0" check always pass.
function cal_acc()
{
    python3.7 ../create_imagenet2012_label.py --img_path="$data_path" || return 1
    python3.7 ../postprocess.py --result_path=./result_Files --label_path=./imagenet_label.json &> acc.log
}
|
||||
|
||||
# Run the three stages in order, stopping at the first one that fails.
if ! compile_app; then
    echo "compile app code failed"
    exit 1
fi
if ! infer; then
    echo " execute inference failed"
    exit 1
fi
if ! cal_acc; then
    echo "calculate accuracy failed"
    exit 1
fi
|
|
@ -1,30 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Run single-device training in the background, logging to log.txt.
# Positional arguments: $1 device id, $2 dataset directory,
# $3 (optional) checkpoint path.
export DEVICE_ID=$1
DATA_DIR=$2
PATH_CHECKPOINT=""
if [ $# -eq 3 ]
then
    PATH_CHECKPOINT=$3
fi

# Paths are quoted so values containing spaces reach train.py intact.
python train.py \
    --device_id=$DEVICE_ID \
    --checkpoint_file_path="$PATH_CHECKPOINT" \
    --data_path="$DATA_DIR" \
    --output_path './output' > log.txt 2>&1 &
|
||||
|
|
@ -1,30 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Run single-device GPU training in the background, logging to log.txt.
# Positional arguments: $1 device id (exported as DEVICE_ID), $2 dataset
# directory, $3 (optional) checkpoint path.
export DEVICE_ID=$1
DATA_DIR=$2
PATH_CHECKPOINT=""
if [ $# -eq 3 ]
then
    PATH_CHECKPOINT=$3
fi

# Paths are quoted so values containing spaces reach train.py intact.
python train.py \
    --checkpoint_file_path="$PATH_CHECKPOINT" \
    --device_target="GPU" \
    --data_path="$DATA_DIR" \
    --output_path './output' > log.txt 2>&1 &
|
||||
|
|
@ -1,16 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""resnext"""
|
||||
from .resnext import *
|
|
@ -1,292 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
ResNext
|
||||
"""
|
||||
import mindspore.nn as nn
|
||||
from mindspore.ops.operations import TensorAdd
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.common.initializer import TruncatedNormal
|
||||
|
||||
from src.utils.cunstom_op import SEBlock, GroupConv
|
||||
|
||||
|
||||
__all__ = ['resnext50', 'resnext101']
|
||||
|
||||
|
||||
def weight_variable(shape, factor=0.1):
    """Return the weight initializer used by this module.

    Args:
        shape: Accepted but not used.
        factor (float): Accepted but not used. Default: 0.1.

    Returns:
        TruncatedNormal, always constructed as ``TruncatedNormal(0.02)``.
    """
    initializer = TruncatedNormal(0.02)
    return initializer
|
||||
|
||||
|
||||
def conv7x7(in_channels, out_channels, stride=1, padding=3, has_bias=False, groups=1):
    """Build a 7x7 ``nn.Conv2d`` with explicit padding (``pad_mode="pad"``)."""
    return nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=7,
        stride=stride,
        pad_mode="pad",
        padding=padding,
        has_bias=has_bias,
        group=groups,
    )
|
||||
|
||||
|
||||
def conv3x3(in_channels, out_channels, stride=1, padding=1, has_bias=False, groups=1):
    """Build a 3x3 ``nn.Conv2d`` with explicit padding (``pad_mode="pad"``)."""
    return nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=3,
        stride=stride,
        pad_mode="pad",
        padding=padding,
        has_bias=has_bias,
        group=groups,
    )
|
||||
|
||||
|
||||
def conv1x1(in_channels, out_channels, stride=1, padding=0, has_bias=False, groups=1):
    """Build a 1x1 ``nn.Conv2d`` with explicit padding (``pad_mode="pad"``)."""
    return nn.Conv2d(
        in_channels,
        out_channels,
        kernel_size=1,
        stride=stride,
        pad_mode="pad",
        padding=padding,
        has_bias=has_bias,
        group=groups,
    )
|
||||
|
||||
|
||||
class _DownSample(nn.Cell):
    """
    Shortcut projection: a 1x1 convolution followed by batch normalization.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        stride (int): Stride size for the 1*1 convolutional layer.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> _DownSample(32, 64, 2)
    """

    def __init__(self, in_channels, out_channels, stride):
        super(_DownSample, self).__init__()
        self.conv = conv1x1(in_channels, out_channels, stride=stride, padding=0)
        self.bn = nn.BatchNorm2d(out_channels)

    def construct(self, x):
        projected = self.conv(x)
        return self.bn(projected)
|
||||
|
||||
|
||||
class BasicBlock(nn.Cell):
    """
    ResNeXt basic block: two 3x3 conv + BN layers with a residual connection.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Output channels.
        stride (int): Stride size for the first convolutional layer. Default: 1.
        down_sample (Cell): Optional cell applied to the input to form the shortcut.
            Default: None (the input itself is the shortcut).
        use_se (bool): Apply an ``SEBlock`` to the residual branch. Default: False.
        platform (str): Accepted for signature parity with ``Bottleneck``; not used here.
        **kwargs: Extra keyword arguments are accepted and ignored.

    Returns:
        Tensor, output tensor.

    Examples:
        >>> BasicBlock(32, 256, stride=2)
    """
    expansion = 1

    def __init__(self, in_channels, out_channels, stride=1, down_sample=None, use_se=False,
                 platform="Ascend", **kwargs):
        super(BasicBlock, self).__init__()
        self.conv1 = conv3x3(in_channels, out_channels, stride=stride)
        self.bn1 = nn.BatchNorm2d(out_channels)
        self.relu = P.ReLU()
        self.conv2 = conv3x3(out_channels, out_channels, stride=1)
        self.bn2 = nn.BatchNorm2d(out_channels)

        self.use_se = use_se
        if use_se:
            self.se = SEBlock(out_channels)

        # The flag lets construct() branch without touching a possibly missing attribute.
        self.down_sample_flag = down_sample is not None
        if self.down_sample_flag:
            self.down_sample = down_sample

        self.add = TensorAdd()

    def construct(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.bn2(self.conv2(out))

        if self.use_se:
            out = self.se(out)

        identity = x
        if self.down_sample_flag:
            identity = self.down_sample(x)

        out = self.add(out, identity)
        return self.relu(out)
|
||||
|
||||
|
||||
class Bottleneck(nn.Cell):
    """
    ResNeXt Bottleneck block: 1x1 conv, grouped 3x3 conv, 1x1 conv, plus a residual connection.

    Args:
        in_channels (int): Input channels.
        out_channels (int): Base output channels; the block emits ``out_channels * expansion``.
        stride (int): Stride size for the grouped 3x3 convolutional layer. Default: 1.
        down_sample (Cell): Optional cell applied to the input to form the shortcut.
            Default: None (the input itself is the shortcut).
        base_width (int): Width of every group. Default: 64.
        groups (int): Groups number. Default: 1.
        use_se (bool): Apply an ``SEBlock`` to the residual branch. Default: False.
        platform (str): "GPU" builds the 3x3 layer with ``nn.Conv2d(group=groups)``;
            any other value builds it with ``GroupConv``. Default: "Ascend".
        **kwargs: Extra keyword arguments are accepted and ignored.

    Returns:
        Tensor, the ResNet unit's output.

    Examples:
        >>> Bottleneck(3, 256, stride=2)
    """
    expansion = 4

    def __init__(self, in_channels, out_channels, stride=1, down_sample=None,
                 base_width=64, groups=1, use_se=False, platform="Ascend", **kwargs):
        super(Bottleneck, self).__init__()

        expanded_channels = out_channels * self.expansion
        width = int(out_channels * (base_width / 64.0)) * groups
        self.groups = groups

        self.conv1 = conv1x1(in_channels, width, stride=1)
        self.bn1 = nn.BatchNorm2d(width)
        self.relu = P.ReLU()

        self.conv3x3s = nn.CellList()

        # Only the GPU path uses the native grouped convolution.
        use_native_group_conv = platform == "GPU"
        if use_native_group_conv:
            self.conv2 = nn.Conv2d(width, width, 3, stride,
                                   pad_mode='pad', padding=1, group=groups)
        else:
            self.conv2 = GroupConv(width, width, 3, stride, pad=1, groups=groups)

        self.bn2 = nn.BatchNorm2d(width)
        self.conv3 = conv1x1(width, expanded_channels, stride=1)
        self.bn3 = nn.BatchNorm2d(expanded_channels)

        self.use_se = use_se
        if use_se:
            self.se = SEBlock(expanded_channels)

        # The flag lets construct() branch without touching a possibly missing attribute.
        self.down_sample_flag = down_sample is not None
        if self.down_sample_flag:
            self.down_sample = down_sample

        self.cast = P.Cast()
        self.add = TensorAdd()

    def construct(self, x):
        out = self.relu(self.bn1(self.conv1(x)))
        out = self.relu(self.bn2(self.conv2(out)))
        out = self.bn3(self.conv3(out))

        if self.use_se:
            out = self.se(out)

        identity = x
        if self.down_sample_flag:
            identity = self.down_sample(x)

        out = self.add(out, identity)
        return self.relu(out)
|
||||
|
||||
|
||||
class ResNeXt(nn.Cell):
    """
    ResNeXt backbone: a stem (7x7 conv, BN, ReLU, max-pool) followed by four residual stages.

    Args:
        block (cell): Block class for the stages; it must expose an ``expansion`` attribute.
        layers (list): Numbers of block in the four stages.
        width_per_group (int): Width of every group. Default: 64.
        groups (int): Groups number. Default: 1.
        use_se (bool): Forwarded to every block. Default: False.
        platform (str): Forwarded to every block. Default: "Ascend".

    Returns:
        Tensor, output of the last stage.

    Examples:
        >>> ResNeXt(Bottleneck, [3, 4, 6, 3], width_per_group=4, groups=32)
    """

    def __init__(self, block, layers, width_per_group=64, groups=1, use_se=False, platform="Ascend"):
        super(ResNeXt, self).__init__()
        self.in_channels = 64
        self.groups = groups
        self.base_width = width_per_group

        self.conv = conv7x7(3, self.in_channels, stride=2, padding=3)
        self.bn = nn.BatchNorm2d(self.in_channels)
        self.relu = P.ReLU()
        self.maxpool = nn.MaxPool2d(kernel_size=3, stride=2, pad_mode='same')

        # _make_layer updates self.in_channels, so the stages must be built in order.
        self.layer1 = self._make_layer(block, 64, layers[0],
                                       use_se=use_se, platform=platform)
        self.layer2 = self._make_layer(block, 128, layers[1], stride=2,
                                       use_se=use_se, platform=platform)
        self.layer3 = self._make_layer(block, 256, layers[2], stride=2,
                                       use_se=use_se, platform=platform)
        self.layer4 = self._make_layer(block, 512, layers[3], stride=2,
                                       use_se=use_se, platform=platform)

        self.out_channels = 512 * block.expansion
        self.cast = P.Cast()

    def construct(self, x):
        stem = self.maxpool(self.relu(self.bn(self.conv(x))))
        feat = self.layer1(stem)
        feat = self.layer2(feat)
        feat = self.layer3(feat)
        feat = self.layer4(feat)
        return feat

    def _make_layer(self, block, out_channels, blocks_num, stride=1, use_se=False, platform="Ascend"):
        """Build one stage of ``blocks_num`` blocks and advance ``self.in_channels``."""
        stage_channels = out_channels * block.expansion

        # A projection shortcut is needed whenever the first block changes resolution or width.
        down_sample = None
        if stride != 1 or self.in_channels != stage_channels:
            down_sample = _DownSample(self.in_channels, stage_channels, stride=stride)

        shared = dict(base_width=self.base_width, groups=self.groups,
                      use_se=use_se, platform=platform)

        # Only the first block carries the stride and the projection shortcut.
        stage = [block(self.in_channels, out_channels, stride=stride,
                       down_sample=down_sample, **shared)]
        self.in_channels = stage_channels
        for _ in range(1, blocks_num):
            stage.append(block(self.in_channels, out_channels, **shared))

        return nn.SequentialCell(stage)

    def get_out_channels(self):
        """Return the channel count of the last stage's output."""
        return self.out_channels
|
||||
|
||||
|
||||
def resnext50(platform="Ascend"):
    """Build a ``ResNeXt`` of ``Bottleneck`` stages [3, 4, 6, 3] with 32 groups of width 4."""
    stage_depths = [3, 4, 6, 3]
    return ResNeXt(Bottleneck, stage_depths, width_per_group=4, groups=32, platform=platform)
|
||||
|
||||
|
||||
def resnext101(platform="Ascend"):
    """Build a ``ResNeXt`` of ``Bottleneck`` stages [3, 4, 23, 3] with 64 groups of width 4."""
    stage_depths = [3, 4, 23, 3]
    return ResNeXt(Bottleneck, stage_depths, width_per_group=4, groups=64, platform=platform)
|
|
@ -1,41 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
define loss function for network.
|
||||
"""
|
||||
from mindspore.nn.loss.loss import Loss
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.ops import functional as F
|
||||
from mindspore import Tensor
|
||||
from mindspore.common import dtype as mstype
|
||||
import mindspore.nn as nn
|
||||
|
||||
class CrossEntropy(Loss):
    """
    Softmax cross entropy on (optionally label-smoothed) one-hot targets, averaged over axis 0.

    Args:
        smooth_factor (float): The true class gets ``1 - smooth_factor``; every other class
            gets ``smooth_factor / (num_classes - 1)``. Default: 0.
        num_classes (int): Number of classes used to spread the smoothing mass. Default: 1000.
    """

    def __init__(self, smooth_factor=0., num_classes=1000):
        super(CrossEntropy, self).__init__()
        on_prob = 1.0 - smooth_factor
        off_prob = 1.0 * smooth_factor / (num_classes - 1)
        self.onehot = P.OneHot()
        self.on_value = Tensor(on_prob, mstype.float32)
        self.off_value = Tensor(off_prob, mstype.float32)
        self.ce = nn.SoftmaxCrossEntropyWithLogits()
        self.mean = P.ReduceMean(False)

    def construct(self, logit, label):
        # The one-hot depth comes from the logits' second dimension, not from num_classes.
        depth = F.shape(logit)[1]
        soft_label = self.onehot(label, depth, self.on_value, self.off_value)
        per_sample = self.ce(logit, soft_label)
        return self.mean(per_sample, 0)
|
|
@ -1,158 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
dataset processing.
|
||||
"""
|
||||
import os
|
||||
from PIL import Image, ImageFile
|
||||
from mindspore.common import dtype as mstype
|
||||
import mindspore.dataset as de
|
||||
import mindspore.dataset.transforms.c_transforms as C
|
||||
import mindspore.dataset.vision.c_transforms as V_C
|
||||
from src.utils.sampler import DistributedSampler
|
||||
|
||||
ImageFile.LOAD_TRUNCATED_IMAGES = True
|
||||
|
||||
|
||||
class TxtDataset():
    """
    Random-access dataset backed by a text file of ``<image_name> <label>`` lines.

    Args:
        root (str): Directory that every image name in the list is joined onto.
        txt_name (str): Path of the list file. Each non-blank line holds an image name and
            an integer label separated by a single space.

    Returns:
        de_dataset.
    """

    def __init__(self, root, txt_name):
        super(TxtDataset, self).__init__()
        self.imgs = []
        self.labels = []
        # The context manager closes the list file even if a line fails to parse.
        with open(txt_name, "r") as fin:
            for line in fin:
                entry = line.strip()
                if not entry:
                    # Tolerate blank lines (e.g. a trailing newline at the end of the file).
                    continue
                img_name, label = entry.split(' ')
                self.imgs.append(os.path.join(root, img_name))
                self.labels.append(int(label))

    def __getitem__(self, index):
        """Return ``(RGB PIL image, int label)`` for the sample at ``index``."""
        # convert() returns a fully loaded copy, so the source file can be closed right away.
        with Image.open(self.imgs[index]) as raw:
            img = raw.convert('RGB')
        return img, self.labels[index]

    def __len__(self):
        """Return the number of samples listed in the text file."""
        return len(self.imgs)
|
||||
|
||||
|
||||
def classification_dataset(data_dir, image_size, per_batch_size, max_epoch, rank, group_size,
                           mode='train',
                           input_mode='folder',
                           root='',
                           num_parallel_workers=None,
                           shuffle=None,
                           sampler=None,
                           class_indexing=None,
                           drop_remainder=True,
                           transform=None,
                           target_transform=None):
    """
    A function that returns a dataset for classification. The mode of input dataset could be "folder" or "txt".
    If it is "folder", all images within one folder have the same label. If it is "txt", all paths of images
    are written into a textfile.

    Args:
        data_dir (str): Path to the root directory that contains the dataset for "input_mode="folder"".
            Or path of the textfile that contains every image's path of the dataset.
        image_size (Union(int, sequence)): Size of the input images.
        per_batch_size (int): the batch size of every step during training.
        max_epoch (int): the number of epochs; the batched dataset is repeated this many times.
        rank (int): The shard ID within num_shards.
        group_size (int): Number of shards that the dataset should be divided into.
        mode (str): "train" or others. Default: "train".
        input_mode (str): The form of the input dataset. "folder" or "txt". Default: "folder".
        root (str): the images path for "input_mode="txt"". Default: "".
        num_parallel_workers (int): Number of workers to read the data. Default: None.
        shuffle (bool): Whether or not to perform shuffle on the dataset. Default: None.
        sampler (Sampler): Object used to choose samples from the dataset. Only used for
            "folder" input; "txt" input always builds its own DistributedSampler. Default: None.
        class_indexing (dict): A str-to-int mapping from folder name to index
            (default=None, the folder names will be sorted
            alphabetically and each class will be given a
            unique index starting from 0).
        drop_remainder (bool): Passed to ``batch``. Default: True.
        transform (list): Image operations replacing the built-in pipeline. Default: None.
        target_transform (list): Label operations replacing the int32 cast. Default: None.

    Examples:
        >>> from src.dataset import classification_dataset
        >>> # path to imagefolder directory. This directory needs to contain sub-directories which contain the images
        >>> data_dir = "/path/to/imagefolder_directory"
        >>> de_dataset = classification_dataset(data_dir, image_size=[224, 224],
        >>>                                     per_batch_size=64, max_epoch=100,
        >>>                                     rank=0, group_size=4)
        >>> # Path of the textfile that contains every image's path of the dataset.
        >>> data_dir = "/path/to/dataset/images/train.txt"
        >>> images_dir = "/path/to/dataset/images"
        >>> de_dataset = classification_dataset(data_dir, image_size=[224, 224],
        >>>                                     per_batch_size=64, max_epoch=100,
        >>>                                     rank=0, group_size=4,
        >>>                                     input_mode="txt", root=images_dir)
    """
    # Normalization statistics scaled by 255 before being handed to Normalize.
    mean = [0.485 * 255, 0.456 * 255, 0.406 * 255]
    std = [0.229 * 255, 0.224 * 255, 0.225 * 255]

    if transform is not None:
        transform_img = transform
    elif mode == 'train':
        transform_img = [
            V_C.RandomCropDecodeResize(image_size, scale=(0.08, 1.0), ratio=(0.75, 1.333)),
            V_C.RandomHorizontalFlip(prob=0.5),
            V_C.RandomColorAdjust(brightness=0.4, contrast=0.4, saturation=0.4),
            V_C.Normalize(mean=mean, std=std),
            V_C.HWC2CHW()
        ]
    else:
        transform_img = [
            V_C.Decode(),
            V_C.Resize((256, 256)),
            V_C.CenterCrop(image_size),
            V_C.Normalize(mean=mean, std=std),
            V_C.HWC2CHW()
        ]

    if target_transform is not None:
        transform_label = target_transform
    else:
        transform_label = [C.TypeCast(mstype.int32)]

    if input_mode == 'folder':
        de_dataset = de.ImageFolderDataset(data_dir, num_parallel_workers=num_parallel_workers,
                                           shuffle=shuffle, sampler=sampler, class_indexing=class_indexing,
                                           num_shards=group_size, shard_id=rank)
    else:
        # Any other input_mode is treated as a text list; sharding is done by the sampler.
        dataset = TxtDataset(root, data_dir)
        sampler = DistributedSampler(dataset, rank, group_size, shuffle=shuffle)
        de_dataset = de.GeneratorDataset(dataset, ["image", "label"], sampler=sampler)

    de_dataset = de_dataset.map(operations=transform_img, input_columns="image",
                                num_parallel_workers=num_parallel_workers)
    de_dataset = de_dataset.map(operations=transform_label, input_columns="label",
                                num_parallel_workers=num_parallel_workers)

    de_dataset = de_dataset.project(columns=["image", "label"])
    de_dataset = de_dataset.batch(per_batch_size, drop_remainder=drop_remainder)
    return de_dataset.repeat(max_epoch)
|
|
@ -1,42 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
common architecture.
|
||||
"""
|
||||
import mindspore.nn as nn
|
||||
from src.utils.cunstom_op import GlobalAvgPooling
|
||||
|
||||
__all__ = ['CommonHead']
|
||||
|
||||
class CommonHead(nn.Cell):
    """
    Classification head: global average pooling followed by a dense layer.

    Args:
        num_classes (int): Number of classes.
        out_channels (int): Number of channels fed into the dense layer.

    Returns:
        Tensor, output tensor.
    """

    def __init__(self, num_classes, out_channels):
        super(CommonHead, self).__init__()
        self.avgpool = GlobalAvgPooling()
        dense = nn.Dense(out_channels, num_classes, has_bias=True)
        self.fc = dense.add_flags_recursive(fp16=True)

    def construct(self, x):
        pooled = self.avgpool(x)
        return self.fc(pooled)
|
|
@ -1,104 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
Image classification.
|
||||
"""
|
||||
import math
|
||||
import mindspore.nn as nn
|
||||
from mindspore.common import initializer as init
|
||||
import src.backbone as backbones
|
||||
import src.head as heads
|
||||
from src.utils.var_init import default_recurisive_init, KaimingNormal
|
||||
|
||||
|
||||
class ImageClassificationNetwork(nn.Cell):
    """
    architecture of image classification network.

    Args:
        backbone (Cell): Feature extractor applied first.
        head (Cell): Classifier applied to the backbone output when ``include_top`` is True.
        include_top (bool): Whether to apply ``head`` (and the optional activation). Default: True.
        activation (str): "None", "Sigmoid" or "Softmax"; only read when ``include_top`` is True.
            Default: "None".

    Returns:
        Tensor, output tensor.

    Raises:
        NotImplementedError: If ``include_top`` is True and ``activation`` is not one of
            "None", "Sigmoid" or "Softmax".
    """

    def __init__(self, backbone, head, include_top=True, activation="None"):
        super(ImageClassificationNetwork, self).__init__()
        self.backbone = backbone
        self.include_top = include_top
        self.need_activation = False
        if self.include_top:
            self.head = head
            if activation != "None":
                # The primitive operators are not imported at module level in this file,
                # so bring them into scope here where they are needed.
                from mindspore.ops import operations as P

                self.need_activation = True
                if activation == "Sigmoid":
                    self.activation = P.Sigmoid()
                elif activation == "Softmax":
                    self.activation = P.Softmax()
                else:
                    raise NotImplementedError(
                        f"The activation {activation} not in [Sigmoid, Softmax].")

    def construct(self, x):
        x = self.backbone(x)
        if self.include_top:
            x = self.head(x)
            if self.need_activation:
                x = self.activation(x)
        return x
|
||||
|
||||
|
||||
class ResNeXt(ImageClassificationNetwork):
    """
    ResNeXt architecture.

    Args:
        backbone_name (string): Name of a backbone factory exported by ``src.backbone``.
        num_classes (int): number of classes, Default is 1000.
        platform (str): Forwarded to the backbone factory. Default: "Ascend".
        include_top (bool): Whether to apply the classification head. Default: True.
        activation (str): Output activation name, see ``ImageClassificationNetwork``. Default: "None".

    Returns:
        Resnet.
    """

    def __init__(self, backbone_name, num_classes=1000, platform="Ascend", include_top=True, activation="None"):
        self.backbone_name = backbone_name
        backbone = backbones.__dict__[self.backbone_name](platform=platform)
        out_channels = backbone.get_out_channels()
        head = heads.CommonHead(num_classes=num_classes,
                                out_channels=out_channels)
        super(ResNeXt, self).__init__(backbone, head, include_top, activation)

        default_recurisive_init(self)

        # cells_and_names() yields (name, cell) pairs; unpack so the type checks
        # look at the cell rather than at the tuple.
        for _, cell in self.cells_and_names():
            if isinstance(cell, nn.Conv2d):
                cell.weight.set_data(init.initializer(
                    KaimingNormal(a=math.sqrt(5), mode='fan_out',
                                  nonlinearity='relu'),
                    cell.weight.shape, cell.weight.dtype))
            elif isinstance(cell, nn.BatchNorm2d):
                cell.gamma.set_data(init.initializer('ones', cell.gamma.shape))
                cell.beta.set_data(init.initializer('zeros', cell.beta.shape))

        # Zero-initialize the last BN in each residual branch,
        # so that the residual branch starts with zeros, and each residual block behaves like an identity.
        # This improves the model by 0.2~0.3% according to https://arxiv.org/abs/1706.02677
        for _, cell in self.cells_and_names():
            if isinstance(cell, backbones.resnext.Bottleneck):
                cell.bn3.gamma.set_data(init.initializer(
                    'zeros', cell.bn3.gamma.shape))
            elif isinstance(cell, backbones.resnext.BasicBlock):
                cell.bn2.gamma.set_data(init.initializer(
                    'zeros', cell.bn2.gamma.shape))
|
||||
|
||||
|
||||
def get_network(**kwargs):
    """Build the classifier on the 'resnext101' backbone, forwarding ``kwargs`` to ``ResNeXt``."""
    backbone_name = 'resnext101'
    return ResNeXt(backbone_name, **kwargs)
|
|
@ -1,142 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
learning rate generator.
|
||||
"""
|
||||
import math
|
||||
from collections import Counter
|
||||
import numpy as np
|
||||
|
||||
|
||||
def linear_warmup_lr(current_step, warmup_steps, base_lr, init_lr):
    """
    Linearly interpolate the learning rate for one step of the warmup stage.

    Args:
        current_step(int): current step in warmup stage.
        warmup_steps(int): all steps in warmup stage.
        base_lr(float): learning rate reached when ``current_step == warmup_steps``.
        init_lr(float): learning rate at step 0 of the warmup.

    Returns:
        float, learning rate.
    """
    start = float(init_lr)
    slope = (float(base_lr) - start) / float(warmup_steps)
    return start + slope * current_step
|
||||
|
||||
|
||||
def warmup_cosine_annealing_lr(lr, steps_per_epoch, warmup_epochs, max_epoch, T_max, eta_min=0):
    """
    Build a per-step learning rate array: linear warmup from 0, then cosine decay.

    The cosine term is evaluated once per epoch, so every step of an epoch shares one value.

    Args:
        lr(float): init learning rate
        steps_per_epoch(int): steps of one epoch
        warmup_epochs(int): number of warmup epochs
        max_epoch(int): total epoch of training
        T_max(int): max epoch in decay.
        eta_min(float): end learning rate

    Returns:
        np.array, learning rate array.
    """
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)

    schedule = []
    for step in range(total_steps):
        if step < warmup_steps:
            value = linear_warmup_lr(step + 1, warmup_steps, lr, 0)
        else:
            epoch = step // steps_per_epoch
            value = eta_min + (lr - eta_min) * (1. + math.cos(math.pi * epoch / T_max)) / 2
        schedule.append(value)

    return np.array(schedule).astype(np.float32)
|
||||
|
||||
|
||||
def warmup_step_lr(lr, lr_epochs, steps_per_epoch, warmup_epochs, max_epoch, gamma=0.1):
    """
    Build a per-step learning rate array: linear warmup from 0, then step decay.

    After warmup the rate is multiplied by ``gamma`` once for every entry of ``lr_epochs``
    whose first step is the current step; milestones that fall inside the warmup are skipped.

    Args:
        lr(float): init learning rate
        lr_epochs(list): learning rate decay epochs list
        steps_per_epoch(int): steps of one epoch
        warmup_epochs(int): number of warmup epochs
        max_epoch(int): total epoch of training
        gamma(float): attenuation constants.

    Returns:
        np.array, learning rate array.
    """
    total_steps = int(max_epoch * steps_per_epoch)
    warmup_steps = int(warmup_epochs * steps_per_epoch)

    # Maps a step index to how many milestones land on it (duplicates decay repeatedly).
    decay_hits = Counter(epoch * steps_per_epoch for epoch in lr_epochs)

    schedule = []
    current = lr
    for step in range(total_steps):
        if step < warmup_steps:
            current = linear_warmup_lr(step + 1, warmup_steps, lr, 0)
        else:
            current = current * gamma**decay_hits[step]
        schedule.append(current)

    return np.array(schedule).astype(np.float32)
|
||||
|
||||
|
||||
def multi_step_lr(lr, milestones, steps_per_epoch, max_epoch, gamma=0.1):
    """Build a step-decay learning rate array without warmup (``warmup_step_lr`` with 0 warmup epochs)."""
    no_warmup_epochs = 0
    return warmup_step_lr(lr, milestones, steps_per_epoch, no_warmup_epochs, max_epoch, gamma=gamma)
|
||||
|
||||
|
||||
def step_lr(lr, epoch_size, steps_per_epoch, max_epoch, gamma=0.1):
    """Build a learning rate array that decays by ``gamma`` every ``epoch_size`` epochs, without warmup."""
    # Milestones are the multiples of epoch_size in the range [1, max_epoch).
    lr_epochs = [epoch for epoch in range(1, max_epoch) if epoch % epoch_size == 0]
    return multi_step_lr(lr, lr_epochs, steps_per_epoch, max_epoch, gamma=gamma)
|
||||
|
||||
|
||||
def get_lr(args):
    """
    generate learning rate array.

    Args:
        args: Object with an ``lr_scheduler`` attribute. For 'exponential' it must also provide
            ``lr``, ``lr_epochs``, ``steps_per_epoch``, ``warmup_epochs``, ``max_epoch`` and
            ``lr_gamma``; for 'cosine_annealing' it must provide ``lr``, ``steps_per_epoch``,
            ``warmup_epochs``, ``max_epoch``, ``T_max`` and ``eta_min``.

    Returns:
        np.array, learning rate array.

    Raises:
        NotImplementedError: If ``args.lr_scheduler`` is neither of the two names above.
    """
    scheduler = args.lr_scheduler

    # 'exponential' is served by the warmup + step-decay schedule.
    if scheduler == 'exponential':
        return warmup_step_lr(args.lr,
                              args.lr_epochs,
                              args.steps_per_epoch,
                              args.warmup_epochs,
                              args.max_epoch,
                              gamma=args.lr_gamma)

    if scheduler == 'cosine_annealing':
        return warmup_cosine_annealing_lr(args.lr,
                                          args.steps_per_epoch,
                                          args.warmup_epochs,
                                          args.max_epoch,
                                          args.T_max,
                                          args.eta_min)

    raise NotImplementedError(args.lr_scheduler)
|
|
@ -1,53 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""Auto mixed precision."""
|
||||
import mindspore.nn as nn
|
||||
from mindspore.ops import functional as F
|
||||
from mindspore._checkparam import Validator as validator
|
||||
from mindspore.common import dtype as mstype
|
||||
|
||||
|
||||
class OutputTo(nn.Cell):
    """
    Wrap a cell and cast its output to float16 or float32.

    Args:
        op (Cell): The wrapped cell; it is called with the single input of ``construct``.
        to_type: Target dtype, validated against ``mstype.float16`` and ``mstype.float32``.
            Default: mstype.float16.
    """

    def __init__(self, op, to_type=mstype.float16):
        super(OutputTo, self).__init__(auto_prefix=False)
        self._op = op
        allowed_types = [mstype.float16, mstype.float32]
        validator.check_type_name('to_type', to_type, allowed_types, None)
        self.to_type = to_type

    def construct(self, x):
        result = self._op(x)
        return F.cast(result, self.to_type)
|
||||
|
||||
|
||||
def auto_mixed_precision(network):
    """
    Do keep batchnorm fp32.

    Casts ``network`` to float16, then walks its direct children: a child named 'fc' is wrapped
    so its output is cast to float32, BatchNorm1d/2d children are run in float32 with their
    output cast back to float16, and every other child is processed recursively.
    """
    children = network.name_cells()
    replaced = False
    network.to_float(mstype.float16)

    for name, child in children.items():
        if child == network:
            continue
        if name == 'fc':
            network.insert_child_to_cell(name, OutputTo(child, mstype.float32))
            replaced = True
        elif isinstance(child, (nn.BatchNorm2d, nn.BatchNorm1d)):
            fp32_child = child.to_float(mstype.float32)
            network.insert_child_to_cell(name, OutputTo(fp32_child, mstype.float16))
            replaced = True
        else:
            auto_mixed_precision(child)

    # Refresh cell_list on a SequentialCell so it reflects the replaced children.
    if replaced and isinstance(network, nn.SequentialCell):
        network.cell_list = list(network.cells())
|
|
@ -1,104 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
network operations
|
||||
"""
|
||||
import mindspore.nn as nn
|
||||
from mindspore.ops import operations as P
|
||||
from mindspore.common import dtype as mstype
|
||||
|
||||
|
||||
class GlobalAvgPooling(nn.Cell):
    """
    global average pooling feature map.

    Averages the input over axes 2 and 3 with ``ReduceMean(False)``, so the reduced
    axes are dropped from the output. Takes no constructor arguments.
    """

    def __init__(self):
        super(GlobalAvgPooling, self).__init__()
        self.mean = P.ReduceMean(False)

    def construct(self, x):
        reduce_axes = (2, 3)
        return self.mean(x, reduce_axes)
|
||||
|
||||
|
||||
class SEBlock(nn.Cell):
    """
    Squeeze and excitation block.

    The input is average-pooled over axes 2 and 3, passed through two dense
    layers (ReLU after the first, sigmoid after the second) and the
    resulting per-channel weights are multiplied back onto the input.

    Args:
        channel (int): number of feature maps.
        reduction (int): the hidden dense layer has ``channel // reduction``
            units. Default: 16.
    """
    def __init__(self, channel, reduction=16):
        super(SEBlock, self).__init__()

        self.avg_pool = GlobalAvgPooling()
        self.fc1 = nn.Dense(channel, channel // reduction)
        self.relu = P.ReLU()
        self.fc2 = nn.Dense(channel // reduction, channel)
        self.sigmoid = P.Sigmoid()
        self.reshape = P.Reshape()
        self.shape = P.Shape()
        # NOTE(review): `sum` and `cast` are not used by `construct`; kept so
        # the set of attributes on the cell is unchanged.
        self.sum = P.Sum()
        self.cast = P.Cast()

    def construct(self, x):
        # `x` is pooled over axes (2, 3) and later multiplied by a
        # (b, c, 1, 1) tensor, so its shape has more than two entries.
        # Unpacking it into exactly two names fails; index the first two
        # dimensions instead.
        x_shape = self.shape(x)
        b = x_shape[0]
        c = x_shape[1]
        y = self.avg_pool(x)

        y = self.reshape(y, (b, c))
        y = self.fc1(y)
        y = self.relu(y)
        y = self.fc2(y)
        y = self.sigmoid(y)
        y = self.reshape(y, (b, c, 1, 1))
        return x * y
|
||||
|
||||
class GroupConv(nn.Cell):
    """
    Group convolution built from one ordinary ``nn.Conv2d`` per group.

    The input is split along axis 1 into ``groups`` parts, each part is cast
    to float32 and convolved by its own layer, and the results are
    concatenated along axis 1.

    Args:
        in_channels (int): Input channels of feature map.
        out_channels (int): Output channels of feature map.
        kernel_size (int): Size of convolution kernel.
        stride (int): Stride size for the group convolution layer.
        pad_mode (str): Forwarded to ``nn.Conv2d``. Default: "pad".
        pad (int): Forwarded to ``nn.Conv2d`` as ``padding``. Default: 0.
        groups (int): Number of groups; must divide both channel counts.
            Default: 1.
        has_bias (bool): Forwarded to ``nn.Conv2d``. Default: False.

    Returns:
        tensor, output tensor.
    """
    def __init__(self, in_channels, out_channels, kernel_size, stride, pad_mode="pad", pad=0, groups=1, has_bias=False):
        super().__init__()
        assert in_channels % groups == 0 and out_channels % groups == 0
        self.groups = groups
        self.convs = nn.CellList()
        self.op_split = P.Split(axis=1, output_num=self.groups)
        self.op_concat = P.Concat(axis=1)
        self.cast = P.Cast()
        group_in = in_channels // groups
        group_out = out_channels // groups
        for _ in range(groups):
            branch = nn.Conv2d(group_in, group_out,
                               kernel_size=kernel_size, stride=stride, has_bias=has_bias,
                               padding=pad, pad_mode=pad_mode, group=1)
            self.convs.append(branch)

    def construct(self, x):
        slices = self.op_split(x)
        branch_outputs = ()
        for idx in range(self.groups):
            branch_input = self.cast(slices[idx], mstype.float32)
            branch_outputs = branch_outputs + (self.convs[idx](branch_input),)
        return self.op_concat(branch_outputs)
|
|
@ -1,82 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
get logger.
|
||||
"""
|
||||
import logging
|
||||
import os
|
||||
import sys
|
||||
from datetime import datetime
|
||||
|
||||
class LOGGER(logging.Logger):
    """
    Logger that prints to stdout and can additionally write a per-rank file.

    Args:
        logger_name (string): logger name.
        rank (int): rank of the current process. A stdout handler is only
            attached when ``rank % 8 == 0``. Default: 0.
    """
    def __init__(self, logger_name, rank=0):
        super(LOGGER, self).__init__(logger_name)
        # Remember the rank immediately: `important_info` reads `self.rank`
        # and used to raise AttributeError when it was called before
        # `setup_logging_file`.
        self.rank = rank
        if rank % 8 == 0:
            console = logging.StreamHandler(sys.stdout)
            console.setLevel(logging.INFO)
            formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
            console.setFormatter(formatter)
            self.addHandler(console)

    def setup_logging_file(self, log_dir, rank=0):
        """Create ``log_dir`` if needed and attach a timestamped file handler.

        The resulting file path is stored in ``self.log_fn``.
        """
        self.rank = rank
        # exist_ok makes a separate existence check unnecessary and avoids a
        # race when several processes create the directory at once.
        os.makedirs(log_dir, exist_ok=True)
        log_name = datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S') + '_rank_{}.log'.format(rank)
        self.log_fn = os.path.join(log_dir, log_name)
        fh = logging.FileHandler(self.log_fn)
        fh.setLevel(logging.INFO)
        formatter = logging.Formatter('%(asctime)s:%(levelname)s:%(message)s')
        fh.setFormatter(formatter)
        self.addHandler(fh)

    def info(self, msg, *args, **kwargs):
        """Log ``msg`` at INFO level."""
        if self.isEnabledFor(logging.INFO):
            self._log(logging.INFO, msg, args, **kwargs)

    def save_args(self, args):
        """Log every attribute of ``args`` (as returned by ``vars``)."""
        self.info('Args:')
        for key, value in vars(args).items():
            self.info('--> %s: %s', key, value)
        self.info('')

    def important_info(self, msg, *args, **kwargs):
        """Log ``msg`` inside a banner of asterisks; only rank 0 emits it."""
        if self.isEnabledFor(logging.INFO) and self.rank == 0:
            line_width = 2
            important_msg = '\n'
            important_msg += ('*'*70 + '\n')*line_width
            important_msg += ('*'*line_width + '\n')*2
            important_msg += '*'*line_width + ' '*8 + msg + '\n'
            important_msg += ('*'*line_width + '\n')*2
            important_msg += ('*'*70 + '\n')*line_width
            self.info(important_msg, *args, **kwargs)
|
||||
|
||||
|
||||
def get_logger(path, rank):
    """Build the "mindversion" LOGGER for ``rank`` with a log file under ``path``."""
    rank_logger = LOGGER("mindversion", rank)
    rank_logger.setup_logging_file(path, rank)
    return rank_logger
|
|
@ -1,36 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
optimizer parameters.
|
||||
"""
|
||||
def get_param_groups(network):
    """Split trainable parameters into no-weight-decay and weight-decay groups.

    Parameters whose name ends with ``.bias``, ``.gamma`` or ``.beta`` go
    into the first group with ``weight_decay`` 0.0. All other parameters go
    into the second group, which carries no explicit ``weight_decay`` key.

    Args:
        network: object exposing ``trainable_params()``.

    Returns:
        list of two dicts: ``[no_decay_group, decay_group]``.
    """
    # Biases and the batch-norm scale/shift parameters are excluded from decay.
    no_decay_suffixes = ('.bias', '.gamma', '.beta')
    without_decay = []
    with_decay = []
    for param in network.trainable_params():
        if param.name.endswith(no_decay_suffixes):
            without_decay.append(param)
        else:
            with_decay.append(param)

    return [{'params': without_decay, 'weight_decay': 0.0}, {'params': with_decay}]
|
|
@ -1,53 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
choose samples from the dataset
|
||||
"""
|
||||
import math
|
||||
import numpy as np
|
||||
|
||||
class DistributedSampler():
    """
    Yield the dataset indices that belong to one rank of a process group.

    The index list is padded by repeating indices from its start so that it
    divides evenly by ``group_size``; each rank then takes every
    ``group_size``-th index starting at its own ``rank``.

    Args:
        dataset: any object supporting ``len()``.
        rank (int): rank of the current process inside the group.
        group_size (int): number of processes in the group.
        shuffle (bool): shuffle the indices on every iteration. Default: True.
        seed (int): base seed; incremented before each shuffled iteration.
            Default: 0.
    """
    def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
        self.dataset = dataset
        self.rank = rank
        self.group_size = group_size
        self.dataset_length = len(self.dataset)
        self.num_samples = int(math.ceil(self.dataset_length * 1.0 / self.group_size))
        self.total_size = self.num_samples * self.group_size
        self.shuffle = shuffle
        self.seed = seed

    def __iter__(self):
        if self.shuffle:
            # Advance the seed so each pass gets a new permutation that is
            # still identical across ranks sharing the same seed history.
            self.seed = (self.seed + 1) & 0xffffffff
            np.random.seed(self.seed)
            indices = np.random.permutation(self.dataset_length).tolist()
        else:
            # `dataset_length` is already an int; calling len() on it raised
            # TypeError whenever shuffle was disabled.
            indices = list(range(self.dataset_length))

        # Pad up to `total_size`. Repeating the list as often as required
        # also covers a shortfall larger than the dataset itself.
        shortfall = self.total_size - len(indices)
        if shortfall > 0 and indices:
            repeats = int(math.ceil(shortfall * 1.0 / len(indices)))
            indices += (indices * repeats)[:shortfall]
        indices = indices[self.rank::self.group_size]
        return iter(indices)

    def __len__(self):
        return self.num_samples
|
||||
|
|
@ -1,228 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""
|
||||
Initialize.
|
||||
"""
|
||||
import os
|
||||
import math
|
||||
from functools import reduce
|
||||
import numpy as np
|
||||
import mindspore.nn as nn
|
||||
from mindspore.common import initializer as init
|
||||
from mindspore.train.serialization import load_checkpoint, load_param_into_net
|
||||
|
||||
def _calculate_gain(nonlinearity, param=None):
|
||||
r"""
|
||||
Return the recommended gain value for the given nonlinearity function.
|
||||
|
||||
The values are as follows:
|
||||
================= ====================================================
|
||||
nonlinearity gain
|
||||
================= ====================================================
|
||||
Linear / Identity :math:`1`
|
||||
Conv{1,2,3}D :math:`1`
|
||||
Sigmoid :math:`1`
|
||||
Tanh :math:`\frac{5}{3}`
|
||||
ReLU :math:`\sqrt{2}`
|
||||
Leaky Relu :math:`\sqrt{\frac{2}{1 + \text{negative\_slope}^2}}`
|
||||
================= ====================================================
|
||||
|
||||
Args:
|
||||
nonlinearity: the non-linear function
|
||||
param: optional parameter for the non-linear function
|
||||
|
||||
Examples:
|
||||
>>> gain = calculate_gain('leaky_relu', 0.2) # leaky_relu with negative_slope=0.2
|
||||
"""
|
||||
linear_fns = ['linear', 'conv1d', 'conv2d', 'conv3d', 'conv_transpose1d', 'conv_transpose2d', 'conv_transpose3d']
|
||||
if nonlinearity in linear_fns or nonlinearity == 'sigmoid':
|
||||
return 1
|
||||
if nonlinearity == 'tanh':
|
||||
return 5.0 / 3
|
||||
if nonlinearity == 'relu':
|
||||
return math.sqrt(2.0)
|
||||
if nonlinearity == 'leaky_relu':
|
||||
if param is None:
|
||||
negative_slope = 0.01
|
||||
elif not isinstance(param, bool) and isinstance(param, int) or isinstance(param, float):
|
||||
negative_slope = param
|
||||
else:
|
||||
raise ValueError("negative_slope {} not a valid number".format(param))
|
||||
return math.sqrt(2.0 / (1 + negative_slope ** 2))
|
||||
|
||||
raise ValueError("Unsupported nonlinearity {}".format(nonlinearity))
|
||||
|
||||
def _assignment(arr, num):
|
||||
"""Assign the value of `num` to `arr`."""
|
||||
if arr.shape == ():
|
||||
arr = arr.reshape((1))
|
||||
arr[:] = num
|
||||
arr = arr.reshape(())
|
||||
else:
|
||||
if isinstance(num, np.ndarray):
|
||||
arr[:] = num[:]
|
||||
else:
|
||||
arr[:] = num
|
||||
return arr
|
||||
|
||||
def _calculate_in_and_out(arr):
|
||||
"""
|
||||
Calculate n_in and n_out.
|
||||
|
||||
Args:
|
||||
arr (Array): Input array.
|
||||
|
||||
Returns:
|
||||
Tuple, a tuple with two elements, the first element is `n_in` and the second element is `n_out`.
|
||||
"""
|
||||
dim = len(arr.shape)
|
||||
if dim < 2:
|
||||
raise ValueError("If initialize data with xavier uniform, the dimension of data must greater than 1.")
|
||||
|
||||
n_in = arr.shape[1]
|
||||
n_out = arr.shape[0]
|
||||
|
||||
if dim > 2:
|
||||
counter = reduce(lambda x, y: x * y, arr.shape[2:])
|
||||
n_in *= counter
|
||||
n_out *= counter
|
||||
return n_in, n_out
|
||||
|
||||
def _select_fan(array, mode):
    """Return the fan-in or fan-out of ``array`` depending on ``mode``.

    ``mode`` is compared case-insensitively and must be ``'fan_in'`` or
    ``'fan_out'``; anything else raises ValueError.
    """
    wanted = mode.lower()
    valid_modes = ['fan_in', 'fan_out']
    if wanted not in valid_modes:
        raise ValueError("Mode {} not supported, please use one of {}".format(wanted, valid_modes))

    # _calculate_in_and_out returns (n_in, n_out), the same order as valid_modes.
    fans = dict(zip(valid_modes, _calculate_in_and_out(array)))
    return fans[wanted]
|
||||
|
||||
class KaimingInit(init.Initializer):
    r"""
    Base class for the He (Kaiming) initializers.

    It stores the fan mode and the gain derived from the nonlinearity;
    ``_initialize`` does nothing here and is overridden by subclasses.

    Args:
        a: the negative slope of the rectifier used after this layer (only
            used with ``'leaky_relu'``)
        mode: either ``'fan_in'`` (default) or ``'fan_out'``. Choosing ``'fan_in'``
            preserves the magnitude of the variance of the weights in the
            forward pass. Choosing ``'fan_out'`` preserves the magnitudes in the
            backwards pass.
        nonlinearity: the non-linear function, recommended to use only with
            ``'relu'`` or ``'leaky_relu'`` (default).
    """
    def __init__(self, a=0, mode='fan_in', nonlinearity='leaky_relu'):
        super().__init__()
        self.mode = mode
        self.gain = _calculate_gain(nonlinearity, a)

    def _initialize(self, arr):
        """Intentionally empty; subclasses fill ``arr``."""
|
||||
|
||||
|
||||
class KaimingUniform(KaimingInit):
    r"""
    Initialize the array with He kaiming uniform algorithm. The resulting tensor will
    have values sampled from :math:`\mathcal{U}(-\text{bound}, \text{bound})` where

    .. math::
        \text{bound} = \text{gain} \times \sqrt{\frac{3}{\text{fan\_mode}}}

    Input:
        arr (Array): The array to be assigned; it is filled in place.

    Examples:
        >>> w = np.empty((3, 5))
        >>> KaimingUniform(mode='fan_in', nonlinearity='relu')._initialize(w)
    """

    def _initialize(self, arr):
        fan_value = _select_fan(arr, self.mode)
        limit = math.sqrt(3.0) * self.gain / math.sqrt(fan_value)
        samples = np.random.uniform(-limit, limit, arr.shape)

        _assignment(arr, samples)
|
||||
|
||||
|
||||
class KaimingNormal(KaimingInit):
    r"""
    Initialize the array with He kaiming normal algorithm. The resulting tensor will
    have values sampled from :math:`\mathcal{N}(0, \text{std}^2)` where

    .. math::
        \text{std} = \frac{\text{gain}}{\sqrt{\text{fan\_mode}}}

    Input:
        arr (Array): The array to be assigned; it is filled in place.

    Examples:
        >>> w = np.empty((3, 5))
        >>> KaimingNormal(mode='fan_out', nonlinearity='relu')._initialize(w)
    """

    def _initialize(self, arr):
        fan_value = _select_fan(arr, self.mode)
        sigma = self.gain / math.sqrt(fan_value)
        samples = np.random.normal(0, sigma, arr.shape)

        _assignment(arr, samples)
|
||||
|
||||
|
||||
def default_recurisive_init(custom_cell):
    """Re-initialise every Conv2d and Dense layer found under ``custom_cell``.

    Weights are drawn with ``KaimingUniform(a=sqrt(5))``. A bias, when
    present, is drawn with ``init.Uniform(1 / sqrt(fan_in))``. All other
    cells, batch norm included, are left untouched.
    """
    for _, layer in custom_cell.cells_and_names():
        if not isinstance(layer, (nn.Conv2d, nn.Dense)):
            continue

        new_weight = init.initializer(KaimingUniform(a=math.sqrt(5)),
                                      layer.weight.shape,
                                      layer.weight.dtype)
        layer.weight.set_data(new_weight)
        if layer.bias is None:
            continue

        fan_in, _ = _calculate_in_and_out(layer.weight)
        bound = 1 / math.sqrt(fan_in)
        new_bias = init.initializer(init.Uniform(bound),
                                    layer.bias.shape,
                                    layer.bias.dtype)
        layer.bias.set_data(new_bias)
|
||||
|
||||
|
||||
def load_pretrain_model(ckpt_file, network, args):
    """Load checkpoint weights into ``network`` if ``ckpt_file`` is a file.

    Keys starting with ``moments.`` are dropped and a leading ``network.``
    prefix is stripped before loading. Nothing happens, and nothing is
    logged, when ``ckpt_file`` is not an existing file.
    """
    if not os.path.isfile(ckpt_file):
        return

    wrapper_prefix = 'network.'
    cleaned = {}
    for name, value in load_checkpoint(ckpt_file).items():
        if name.startswith('moments.'):
            # Entries under 'moments.' are not loaded into the network.
            continue
        if name.startswith(wrapper_prefix):
            name = name[len(wrapper_prefix):]
        cleaned[name] = value
    load_param_into_net(network, cleaned)
    args.logger.info('load model {} success'.format(ckpt_file))
|
|
@ -1,205 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""train ImageNet."""
|
||||
import os
|
||||
import time
|
||||
import datetime
|
||||
|
||||
import mindspore.nn as nn
|
||||
from mindspore import Tensor, context
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.nn.optim import Momentum
|
||||
from mindspore.communication.management import init, get_rank, get_group_size
|
||||
from mindspore.train.callback import ModelCheckpoint
|
||||
from mindspore.train.callback import CheckpointConfig, Callback
|
||||
from mindspore.train.model import Model
|
||||
from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
|
||||
from mindspore.common import set_seed
|
||||
|
||||
from src.dataset import classification_dataset
|
||||
from src.crossentropy import CrossEntropy
|
||||
from src.lr_generator import get_lr
|
||||
from src.utils.logging import get_logger
|
||||
from src.utils.optimizers__init__ import get_param_groups
|
||||
from src.utils.var_init import load_pretrain_model
|
||||
from src.image_classification import get_network
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
|
||||
set_seed(1)
|
||||
|
||||
class BuildTrainNetwork(nn.Cell):
    """Cell that chains a network and a criterion so the forward pass returns the loss."""
    def __init__(self, network, criterion):
        super().__init__()
        self.network = network
        self.criterion = criterion

    def construct(self, input_data, label):
        prediction = self.network(input_data)
        return self.criterion(prediction, label)
|
||||
|
||||
class ProgressMonitor(Callback):
    """Callback that logs loss, throughput and new checkpoint files per epoch.

    Args:
        args: configuration object. This class reads ``logger``,
            ``steps_per_epoch``, ``per_batch_size``, ``group_size``,
            ``rank``, ``rank_save_ckpt_flag`` and ``outputs_dir`` from it.
    """
    def __init__(self, args):
        super(ProgressMonitor, self).__init__()
        self.me_epoch_start_time = 0
        self.me_epoch_start_step_num = 0
        self.args = args
        self.ckpt_history = []

    def begin(self, run_context):
        self.args.logger.info('start network train...')
        # Start the clock here. Otherwise the first epoch is timed against
        # the initial value 0 (the Unix epoch) and its mean_fps is meaningless.
        self.me_epoch_start_time = time.time()

    def epoch_begin(self, run_context):
        pass

    def epoch_end(self, run_context, *me_args):
        cb_params = run_context.original_args()
        me_step = cb_params.cur_step_num - 1

        real_epoch = me_step // self.args.steps_per_epoch
        time_used = time.time() - self.me_epoch_start_time
        images_seen = self.args.per_batch_size * (me_step - self.me_epoch_start_step_num) * self.args.group_size
        # Guard against a zero interval so logging can never raise.
        fps_mean = images_seen / time_used if time_used > 0 else 0.0
        self.args.logger.info('epoch[{}], iter[{}], loss:{}, mean_fps:{:.2f}'
                              'imgs/sec'.format(real_epoch, me_step, cb_params.net_outputs, fps_mean))

        if self.args.rank_save_ckpt_flag:
            import glob
            # Search the output directory itself and the per-rank
            # 'ckpt_<rank>' sub-directory that the training script passes to
            # ModelCheckpoint.
            ckpt_dirs = (self.args.outputs_dir,
                         os.path.join(self.args.outputs_dir, 'ckpt_' + str(self.args.rank)))
            ckpts = []
            for ckpt_dir in ckpt_dirs:
                ckpts.extend(glob.glob(os.path.join(ckpt_dir, '*.ckpt')))
            rank_prefix = '{}-'.format(self.args.rank)
            for ckpt in ckpts:
                ckpt_fn = os.path.basename(ckpt)
                if not ckpt_fn.startswith(rank_prefix):
                    continue
                if ckpt in self.ckpt_history:
                    continue
                self.ckpt_history.append(ckpt)
                self.args.logger.info('epoch[{}], iter[{}], loss:{}, ckpt:{},'
                                      'ckpt_fn:{}'.format(real_epoch, me_step, cb_params.net_outputs, ckpt, ckpt_fn))

        self.me_epoch_start_step_num = me_step
        self.me_epoch_start_time = time.time()

    def step_begin(self, run_context):
        pass

    def step_end(self, run_context, *me_args):
        pass

    def end(self, run_context):
        self.args.logger.info('end network train...')
|
||||
|
||||
|
||||
def set_parameters():
    """Configure the execution context and fill run-time fields on ``config``.

    Sets ``rank``, ``group_size``, ``rank_save_ckpt_flag``, ``outputs_dir``
    and ``logger`` on the global config, forces ``loss_scale`` to 1 when
    dynamic loss scaling is selected, and returns the config.
    """
    context.set_context(mode=context.GRAPH_MODE, enable_auto_mixed_precision=True,
                        device_target=config.device_target, save_graphs=False)

    # init distributed
    if config.run_distribute:
        init()
        config.rank = get_rank()
        config.group_size = get_group_size()
    else:
        config.rank, config.group_size = 0, 1

    if config.is_dynamic_loss_scale == 1:
        config.loss_scale = 1  # for dynamic loss scale can not set loss scale in momentum opt

    # select for master rank save ckpt or all rank save, compatible for model parallel
    saves_here = (not config.is_save_on_master) or config.rank == 0
    config.rank_save_ckpt_flag = 1 if saves_here else 0

    # logger
    run_stamp = datetime.datetime.now().strftime('%Y-%m-%d_time_%H_%M_%S')
    config.outputs_dir = os.path.join(config.output_path, run_stamp)
    config.logger = get_logger(config.outputs_dir, config.rank)
    return config
|
||||
|
||||
@moxing_wrapper()
def train():
    """Run the whole training job: context, data, network, optimizer, loss, callbacks."""
    set_parameters()
    device_id = os.getenv('DEVICE_ID', "not_set")
    if device_id.isdigit():
        context.set_context(device_id=int(device_id))

    # init distributed
    if config.run_distribute:
        context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                          device_num=config.group_size,
                                          gradients_mean=True)

    # dataloader
    de_dataset = classification_dataset(config.data_path, config.image_size,
                                        config.per_batch_size, 1,
                                        config.rank, config.group_size, num_parallel_workers=8)
    de_dataset.map_model = 4  # !!!important
    config.steps_per_epoch = de_dataset.get_dataset_size()

    config.logger.save_args(config)

    # network
    config.logger.important_info('start create network')
    network = get_network(num_classes=config.num_classes, platform=config.device_target)
    load_pretrain_model(config.checkpoint_file_path, network, config)

    # lr scheduler
    lr = get_lr(config)

    # optimizer
    opt = Momentum(params=get_param_groups(network),
                   learning_rate=Tensor(lr),
                   momentum=config.momentum,
                   weight_decay=config.weight_decay,
                   loss_scale=config.loss_scale)

    # loss
    if not config.label_smooth:
        config.label_smooth_factor = 0.0
    loss = CrossEntropy(smooth_factor=config.label_smooth_factor, num_classes=config.num_classes)

    if config.is_dynamic_loss_scale == 1:
        loss_scale_manager = DynamicLossScaleManager(init_loss_scale=65536, scale_factor=2, scale_window=2000)
    else:
        loss_scale_manager = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)

    model = Model(network, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale_manager,
                  metrics={'acc'}, amp_level="O3")

    # callbacks: progress logging always, checkpointing only on saving ranks
    callbacks = [ProgressMonitor(config)]
    if config.rank_save_ckpt_flag:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=config.ckpt_interval * config.steps_per_epoch,
                                       keep_checkpoint_max=config.ckpt_save_max)
        save_ckpt_path = os.path.join(config.outputs_dir, 'ckpt_' + str(config.rank) + '/')
        callbacks.append(ModelCheckpoint(config=ckpt_config,
                                         directory=save_ckpt_path,
                                         prefix='{}'.format(config.rank)))

    model.train(config.max_epoch, de_dataset, callbacks=callbacks, dataset_sink_mode=True)
|
||||
|
||||
|
||||
if __name__ == "__main__":
|
||||
train()
|
|
@ -1,18 +0,0 @@
|
|||
#!/bin/bash
|
||||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
# Point CMake at the installed mindspore-ascend package, then build.
# Stop if configuration fails so `make` never runs on a broken configuration.
cmake . -DMINDSPORE_PATH="$(pip3.7 show mindspore-ascend | grep Location | awk '{print $2"/mindspore"}' | xargs realpath)" || exit 1
make
|
|
@ -1,48 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
"""create_imagenet2012_label"""
|
||||
import os
|
||||
import json
|
||||
import argparse
|
||||
|
||||
parser = argparse.ArgumentParser(description="resnet imagenet2012 label")
|
||||
parser.add_argument("--img_path", type=str, required=True, help="imagenet2012 file path.")
|
||||
args = parser.parse_args()
|
||||
|
||||
|
||||
def create_label(file_path):
    """Write ``imagenet_label.json`` mapping each image file name to a class index.

    Every sub-directory of ``file_path`` is treated as one class; classes are
    numbered in sorted directory-name order. The JSON file is written to the
    current working directory.

    Args:
        file_path (str): directory holding one sub-directory per class.
    """
    print("[WARNING] Create imagenet label. Currently only use for Imagenet2012!")
    # Only directories are classes. A stray file in the root used to crash
    # the nested os.listdir call.
    class_dirs = sorted(entry for entry in os.listdir(file_path)
                        if os.path.isdir(os.path.join(file_path, entry)))

    total = 0
    img_label = {}
    for class_index, class_dir in enumerate(class_dirs):
        files = os.listdir(os.path.join(file_path, class_dir))
        for image_name in files:
            img_label[image_name] = class_index
        total += len(files)

    # Plain write mode is enough; the file is never read back here.
    with open("imagenet_label.json", "w") as label:
        json.dump(img_label, label)

    print("[INFO] Completed! Total {} data.".format(total))
|
||||
|
||||
|
||||
if __name__ == '__main__':
|
||||
create_label(args.img_path)
|
|
@ -1,125 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Parse arguments"""
|
||||
|
||||
import os
|
||||
import ast
|
||||
import argparse
|
||||
from pprint import pprint, pformat
|
||||
import yaml
|
||||
|
||||
_config_path = "./default_config.yaml"
|
||||
|
||||
class Config:
    """
    Attribute-style view of a configuration dictionary.

    Nested dicts become nested ``Config`` objects. Lists and tuples become
    lists whose dict elements are converted the same way.
    """
    def __init__(self, cfg_dict):
        def _wrap(value):
            # Dictionaries turn into Config objects; anything else is kept.
            return Config(value) if isinstance(value, dict) else value

        for key, value in cfg_dict.items():
            if isinstance(value, (list, tuple)):
                converted = [_wrap(entry) for entry in value]
            else:
                converted = _wrap(value)
            setattr(self, key, converted)

    def __str__(self):
        return pformat(vars(self))

    def __repr__(self):
        return self.__str__()
|
||||
|
||||
|
||||
def parse_cli_to_yaml(parser, cfg, helper=None, choices=None, cfg_path="default_config.yaml"):
    """
    Parse command line arguments to the configuration according to the default yaml.

    One ``--<key>`` option is registered for every entry of ``cfg`` whose
    value is neither a list nor a dict. Boolean defaults are parsed with
    ``ast.literal_eval``; other options use the type of their default.

    Args:
        parser: Parent parser.
        cfg: Base configuration.
        helper: Helper description.
        choices: Optional mapping from option name to its allowed values.
        cfg_path: Path to the default yaml config.

    Returns:
        The namespace produced by ``parse_args()``.
    """
    parser = argparse.ArgumentParser(description="[REPLACE THIS at config.py]",
                                     parents=[parser])
    if helper is None:
        helper = {}
    if choices is None:
        choices = {}

    for key in cfg:
        default_value = cfg[key]
        if isinstance(default_value, (list, dict)):
            continue

        if key in helper:
            description = helper[key]
        else:
            description = "Please reference to {}".format(cfg_path)
        allowed = choices[key] if key in choices else None
        # bool("False") is True, so booleans go through literal_eval instead.
        converter = ast.literal_eval if isinstance(default_value, bool) else type(default_value)
        parser.add_argument("--" + key, type=converter, default=default_value, choices=allowed,
                            help=description)
    return parser.parse_args()
|
||||
|
||||
|
||||
def parse_yaml(yaml_path):
    """
    Parse the yaml config file.

    The file may hold one YAML document (the configuration) or two (the
    configuration followed by per-option help texts).

    Args:
        yaml_path: Path to the yaml config.

    Returns:
        Tuple ``(cfg, cfg_helper)``; ``cfg_helper`` is ``{}`` when the file
        holds a single document.

    Raises:
        ValueError: if the file cannot be parsed, or does not contain exactly
            one or two documents.
    """
    with open(yaml_path, 'r') as fin:
        try:
            cfgs = list(yaml.load_all(fin.read(), Loader=yaml.FullLoader))
        except Exception as err:
            # Still ValueError for callers, but the real cause stays attached
            # and KeyboardInterrupt/SystemExit are no longer swallowed.
            raise ValueError("Failed to parse yaml") from err

    # Kept outside the try block so this specific message reaches the caller
    # instead of being replaced by the generic parse error.
    if len(cfgs) == 1:
        cfg_helper = {}
        cfg = cfgs[0]
    elif len(cfgs) == 2:
        cfg, cfg_helper = cfgs
    else:
        raise ValueError("At most 2 docs (config and help description for help) are supported in config yaml")
    print(cfg_helper)
    return cfg, cfg_helper
|
||||
|
||||
|
||||
def merge(args, cfg):
    """
    Merge the base config from yaml file and command line arguments.

    Every attribute of ``args`` is written into ``cfg``, overwriting existing
    keys. ``cfg`` is modified in place and also returned.

    Args:
        args: Command line arguments.
        cfg: Base configuration.
    """
    for option, value in vars(args).items():
        cfg[option] = value
    return cfg
|
||||
|
||||
|
||||
def get_config():
    """
    Get Config according to the yaml file and cli arguments.

    The command line is parsed twice: first only for ``--config_path`` so the
    yaml file can be located and loaded, then again by ``parse_cli_to_yaml``
    with one option per yaml entry. The parsed arguments are merged over the
    yaml values.

    Returns:
        A ``Config`` built from the merged configuration.
    """
    # add_help=False because this parser is handed on as a parent parser.
    parser = argparse.ArgumentParser(description="default name", add_help=False)
    current_dir = os.path.dirname(os.path.abspath(__file__))
    parser.add_argument("--config_path", type=str, default=os.path.join(current_dir, "../../default_config.yaml"),
                        help="Config file path")
    # Only --config_path is registered so far; other flags are tolerated here.
    path_args, _ = parser.parse_known_args()
    default, helper = parse_yaml(path_args.config_path)
    pprint(default)
    # Pass by keyword: the visible body of parse_cli_to_yaml uses both a
    # `choices` and a `cfg_path` argument, so a fourth positional argument may
    # not bind to cfg_path.
    # NOTE(review): the signature is not visible from here - confirm.
    args = parse_cli_to_yaml(parser, default, helper=helper, cfg_path=path_args.config_path)
    final_config = merge(args, default)
    return Config(final_config)
|
||||
|
||||
# Module-level configuration object, built once when this module is imported.
config = get_config()
|
|
@ -1,27 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Device adapter for ModelArts"""
|
||||
|
||||
from src.model_utils.config import config
|
||||
|
||||
if config.enable_modelarts:
|
||||
from src.model_utils.moxing_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||
else:
|
||||
from src.model_utils.local_adapter import get_device_id, get_device_num, get_rank_id, get_job_id
|
||||
|
||||
__all__ = [
|
||||
"get_device_id", "get_device_num", "get_rank_id", "get_job_id"
|
||||
]
|
|
@ -1,36 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Local adapter"""
|
||||
|
||||
import os
|
||||
|
||||
def get_device_id():
    """Return the ``DEVICE_ID`` environment variable as an int (0 when unset)."""
    return int(os.environ.get('DEVICE_ID', '0'))
|
||||
|
||||
|
||||
def get_device_num():
    """Return the ``RANK_SIZE`` environment variable as an int (1 when unset)."""
    return int(os.environ.get('RANK_SIZE', '1'))
|
||||
|
||||
|
||||
def get_rank_id():
    """Return the ``RANK_ID`` environment variable as an int (0 when unset)."""
    return int(os.environ.get('RANK_ID', '0'))
|
||||
|
||||
|
||||
def get_job_id():
    """Return the constant job name reported by the local adapter."""
    local_job_name = "Local Job"
    return local_job_name
|
|
@ -1,115 +0,0 @@
|
|||
# Copyright 2021 Huawei Technologies Co., Ltd
|
||||
#
|
||||
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||
# you may not use this file except in compliance with the License.
|
||||
# You may obtain a copy of the License at
|
||||
#
|
||||
# http://www.apache.org/licenses/LICENSE-2.0
|
||||
#
|
||||
# Unless required by applicable law or agreed to in writing, software
|
||||
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||
# See the License for the specific language governing permissions and
|
||||
# limitations under the License.
|
||||
# ============================================================================
|
||||
|
||||
"""Moxing adapter for ModelArts"""
|
||||
|
||||
import os
|
||||
import functools
|
||||
from mindspore import context
|
||||
from src.model_utils.config import config
|
||||
|
||||
# Counter incremented by every sync_data() call in this process; its value is
# appended to the lock file name so each call uses a distinct lock file.
_global_sync_count = 0
|
||||
|
||||
def get_device_id():
    """Return the ``DEVICE_ID`` environment variable as an int (0 when unset)."""
    return int(os.environ.get('DEVICE_ID', '0'))
|
||||
|
||||
|
||||
def get_device_num():
    """Return the ``RANK_SIZE`` environment variable as an int (1 when unset)."""
    return int(os.environ.get('RANK_SIZE', '1'))
|
||||
|
||||
|
||||
def get_rank_id():
    """Return the ``RANK_ID`` environment variable as an int (0 when unset)."""
    return int(os.environ.get('RANK_ID', '0'))
|
||||
|
||||
|
||||
def get_job_id():
    """
    Return the job id taken from the ``JOB_ID`` environment variable.

    Returns:
        The value of ``JOB_ID``, or ``"default"`` when the variable is unset
        or empty.
    """
    # os.getenv returns None for an unset variable; `or` covers both None and
    # the empty string, so the fallback applies in either case.
    return os.getenv('JOB_ID') or "default"
|
||||
|
||||
def sync_data(from_path, to_path):
    """
    Copy data between a remote obs url and a local directory.

    Downloads when ``from_path`` is the remote url and ``to_path`` the local
    path, and uploads in the opposite case. The copy itself is done with
    ``mox.file.copy_parallel``. A lock file under ``/tmp`` marks completion:
    the process that performs the copy creates it, and every caller blocks
    until the file exists.
    """
    import moxing as mox
    import time
    global _global_sync_count
    lock_file = "/tmp/copy_sync.lock" + str(_global_sync_count)
    _global_sync_count += 1

    # Each server contains 8 devices at most.
    device_id = get_device_id()
    devices_per_server = min(get_device_num(), 8)
    if device_id % devices_per_server == 0:
        if not os.path.exists(lock_file):
            print("from path: ", from_path)
            print("to path: ", to_path)
            mox.file.copy_parallel(from_path, to_path)
            print("===finish data synchronization===")
            try:
                os.mknod(lock_file)
            except IOError:
                # Best effort: failure to create the lock file is ignored.
                pass
            print("===save flag===")

    # Poll once per second until the lock file shows up.
    while not os.path.exists(lock_file):
        time.sleep(1)

    print("Finish sync data from {} to {}.".format(from_path, to_path))
|
||||
|
||||
|
||||
def moxing_wrapper(pre_process=None, post_process=None):
    """
    Moxing wrapper to download dataset and upload outputs.

    Builds a decorator. When ``config.enable_modelarts`` is set, the decorated
    function is preceded by data synchronisation into the local paths and
    followed by a copy of ``config.output_path`` to ``config.train_url``.

    Args:
        pre_process: Optional callable invoked with no arguments before the
            decorated function.
        post_process: Optional callable invoked with no arguments after the
            decorated function.

    Returns:
        A decorator. The function it produces returns whatever the decorated
        function returns.
    """
    def wrapper(run_func):
        @functools.wraps(run_func)
        def wrapped_func(*args, **kwargs):
            # NOTE(review): block nesting below was reconstructed from a
            # whitespace-stripped source - confirm that the context setup and
            # pre_process belong under enable_modelarts.
            # Download data from data_url
            if config.enable_modelarts:
                if config.data_url:
                    sync_data(config.data_url, config.data_path)
                    print("Dataset downloaded: ", os.listdir(config.data_path))
                if config.checkpoint_url:
                    sync_data(config.checkpoint_url, config.load_path)
                    print("Preload downloaded: ", os.listdir(config.load_path))
                if config.train_url:
                    sync_data(config.train_url, config.output_path)
                    print("Workspace downloaded: ", os.listdir(config.output_path))

                context.set_context(save_graphs_path=os.path.join(config.output_path, str(get_rank_id())))
                config.device_num = get_device_num()
                config.device_id = get_device_id()
                if not os.path.exists(config.output_path):
                    os.makedirs(config.output_path)

                if pre_process:
                    pre_process()

            # Keep the result so the decorator does not swallow the return
            # value of the decorated function.
            result = run_func(*args, **kwargs)

            # Upload data to train_url
            if config.enable_modelarts:
                if post_process:
                    post_process()

                if config.train_url:
                    print("Start to copy output directory")
                    sync_data(config.output_path, config.train_url)
            return result
        return wrapped_func
    return wrapper
|
Loading…
Reference in New Issue