!20547 shufflenetv1 add gpu train and eval
Merge pull request !20547 from chenweitao_295/shufflenetv1_gpu
Commit 86b07c3693
@@ -62,8 +62,11 @@ The core of ShuffleNetV1 is divided into three stages, and each stage repeatedly stacks

├─README_CN.md                          # ShuffleNetV1 description
├─scripts
  ├─run_standalone_train.sh             # Single-device training script for Ascend
  ├─run_standalone_train_gpu.sh         # Single-device training script for GPU
  ├─run_distribute_train.sh             # 8-device parallel training script for Ascend
  ├─run_distribute_train_gpu.sh         # 8-device parallel training script for GPU
  ├─run_eval.sh                         # Evaluation script for Ascend
  ├─run_eval_gpu.sh                     # Evaluation script for GPU
  ├─run_infer_310.sh                    # Ascend 310 inference shell script
├─src
  ├─dataset.py                          # Data preprocessing
@@ -76,6 +79,7 @@ The core of ShuffleNetV1 is divided into three stages, and each stage repeatedly stacks

    ├──local_adapter.py                 # Device-related information
    ├──moxing_adapter.py                # Decorator (mainly used for ModelArts data copying)
  ├─default_config.yaml                 # Parameter file
  ├─gpu_default_config.yaml             # GPU parameter file
  ├─train.py                            # Network training script
  ├─export.py                           # Model format conversion script
  ├─eval.py                             # Network evaluation script
@@ -107,6 +111,8 @@ The core of ShuffleNetV1 is divided into three stages, and each stage repeatedly stacks

'momentum': 0.9                         # Momentum value used by the Momentum optimizer
```

For more information, see `default_config.yaml` for Ascend and `gpu_default_config.yaml` for GPU.
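As a quick, non-authoritative illustration of how these files are consumed (this assumes the standard model_zoo config wrapper in `src/model_utils/config.py`, which exposes each YAML field as a command-line flag; the dataset path and log file name below are placeholders):

```shell
# Select the GPU parameter file and shadow a few of its fields for a short trial run.
python train.py \
    --config_path=./gpu_default_config.yaml \
    --train_dataset_path=/path/to/imagenet/train \
    --epoch_size=10 \
    --batch_size=64 \
    --device_target=GPU > quick_trial.log 2>&1 &
```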
## Training Process

### Launch
@@ -115,12 +121,30 @@ The core of ShuffleNetV1 is divided into three stages, and each stage repeatedly stacks

```shell
# Training examples
- running on Ascend with default parameters

  python:
      Ascend single-device training example: python train.py --train_dataset_path [DATA_DIR]

  shell:
      Ascend 8-device parallel training: sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_DIR]
      Ascend single-device training example: sh scripts/run_standalone_train.sh [DEVICE_ID] [DATA_DIR]

- running on GPU with the default GPU parameters

  python:
      GPU single-device training example: python train.py --config_path [CONFIG_PATH] --device_target [DEVICE_TARGET]
      GPU 8-device training example:
          export RANK_SIZE=8
          mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
          python train.py --config_path [CONFIG_PATH] \
                          --train_dataset_path [TRAIN_DATA_DIR] \
                          --is_distributed=True \
                          --device_target=GPU > log.txt 2>&1 &

  shell:
      GPU single-device training example: sh scripts/run_standalone_train_gpu.sh [DEVICE_ID] [DATA_DIR]
      GPU 8-device parallel training: sh scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR]
```

Distributed training on Ascend requires an HCCL configuration file in JSON format to be created in advance.
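For reference, one common way to produce that rank table file is the `hccl_tools.py` helper shipped with the MindSpore model_zoo (its exact location in your checkout is an assumption here; adjust the path as needed):

```shell
# Sketch: generate a rank table JSON for 8 Ascend devices with the model_zoo helper.
python utils/hccl_tools/hccl_tools.py --device_num "[0,8)"
# Pass the generated JSON file as [RANK_TABLE_FILE] to scripts/run_distribute_train.sh.
```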
@@ -148,15 +172,20 @@ epoch time: 99864.092, per step time: 79.827, avg loss: 3.442

You can run evaluation with either a Python or a shell script.

```shell
# Evaluation examples
# Ascend evaluation example
  python:
      python eval.py --eval_dataset_path [DATA_DIR] --ckpt_path [PATH_CHECKPOINT]

  shell:
      sh scripts/run_eval.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT]
```

> The ckpt files are generated during training.

```shell
# GPU evaluation example
  python:
      python eval.py --config_path [CONFIG_PATH] --eval_dataset_path [DATA_DIR] --ckpt_path [PATH_CHECKPOINT]

  shell:
      sh scripts/run_eval_gpu.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT]
```
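Once the shell script finishes, the metrics end up in the background log it writes (`eval.log` for the GPU script). A minimal sketch for checking them, assuming eval.py prints an accuracy line containing "acc":

```shell
# The GPU eval script runs in an eval/ directory and redirects output to eval.log.
tail -f ../eval/eval.log          # follow the evaluation run
grep -i "acc" ../eval/eval.log    # pull out the accuracy line once it completes
```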
### Results
@@ -272,21 +301,21 @@ The inference results of the Densenet121 network on ImageNet are as follows:

## Training Performance

| Parameters                 | Ascend                                |
| -------------------------- | ------------------------------------- |
| Model name                 | ShuffleNetV1                          |
| Environment                | Ascend 910; OS Euler 2.8              |
| Upload date                | 2020-12-3                             |
| MindSpore version          | 1.0.0                                 |
| Dataset                    | ImageNet                              |
| Training parameters        | src/config.py                         |
| Optimizer                  | Momentum                              |
| Loss function              | SoftmaxCrossEntropyWithLogits         |
| Final loss                 | 2.05                                  |
| Accuracy (8p)              | Top1[73.9%], Top5[91.4%]              |
| Total training time (8p)   | 7.0h                                  |
| Total evaluation time      | 99s                                   |
| Number of parameters (M)   | 44M                                   |
| Parameters                 | Ascend                                | GPU                           |
| -------------------------- | ------------------------------------- | ----------------------------- |
| Model name                 | ShuffleNetV1                          | ShuffleNetV1                  |
| Environment                | Ascend 910; OS Euler 2.8              | Tesla V100; OS Euler 2.8      |
| Upload date                | 2020-12-3                             | 2021-07-15                    |
| MindSpore version          | 1.0.0                                 | 1.3.0                         |
| Dataset                    | ImageNet                              | ImageNet                      |
| Training parameters        | default_config.yaml                   | gpu_default_config.yaml       |
| Optimizer                  | Momentum                              | Momentum                      |
| Loss function              | SoftmaxCrossEntropyWithLogits         | SoftmaxCrossEntropyWithLogits |
| Final loss                 | 2.05                                  | 2.04                          |
| Accuracy (8p)              | Top1[73.9%], Top5[91.4%]              | Top1[73.8%], Top5[91.4%]      |
| Total training time (8p)   | 7.0h                                  | 20.0h                         |
| Total evaluation time      | 99s                                   | 58s                           |
| Number of parameters (M)   | 44M                                   | 51.3M                         |
| Scripts                    | [Link](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/shufflenetv1) |

# Description of Random Situation
@@ -0,0 +1,72 @@
# Builtin Configurations (DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
enable_modelarts: False
# url for modelarts
data_url: ""
train_url: ""
checkpoint_url: ""
# path for local
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
device_target: "GPU"
enable_profiling: False

# ======================================================================================
# common options
num_classes: 1000
label_smooth_factor: 0.1
model_size: "2.0x"
device_id: 0

# ======================================================================================
# Training options
epoch_size: 250
keep_checkpoint_max: 5
save_ckpt_path: "./"
save_checkpoint_epochs: 1
save_checkpoint: True
amp_level: "O2"
is_distributed: False
train_dataset_path: ""
resume: ""

# Dataset config
batch_size: 128

# learning rate config
decay_method: "cosine"
lr_init: 0.00
lr_max: 0.50
lr_end: 0.00
warmup_epochs: 4
loss_scale: 1024

# optimization config
weight_decay: 0.00004
momentum: 0.9

# ======================================================================================
# Eval options
ckpt_path: ""
eval_dataset_path: ""

# ======================================================================================
# export options
file_name: "shufflenetv1"
file_format: "MINDIR"

---
# Help description for each configuration
enable_modelarts: "Whether to train on ModelArts; default: False"
data_url: "URL for ModelArts"
train_url: "URL for ModelArts"
data_path: "The location of the input data"
output_path: "The location of the output file"
device_target: "Target device to run on, GPU or Ascend (default: GPU)"
enable_profiling: "Whether to enable profiling while training; default: False"
is_distributed: "Distributed training"
resume: "Resume training from an existing checkpoint"
model_size: "ShuffleNetV1 model size; choices: 2.0x, 1.5x, 1.0x, 0.5x"
device_id: "Device id"
file_name: "Output file name"
file_format: "File format; choices: [AIR, MINDIR, ONNX]"
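Since this file also carries the export options (`file_name`, `file_format`), here is a hedged sketch of exporting a trained checkpoint with the same config, assuming `export.py` reads this YAML through the shared config wrapper and accepts `--ckpt_path` (the checkpoint path is a placeholder):

```shell
# Export a trained checkpoint to MINDIR using the GPU parameter file.
python export.py \
    --config_path=./gpu_default_config.yaml \
    --ckpt_path=/path/to/shufflenetv1.ckpt \
    --file_name=shufflenetv1 \
    --file_format=MINDIR
```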
@@ -0,0 +1,55 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Run as: sh scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR]

if [ $# != 2 ]; then
    echo "Usage: sh run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR]"
    exit 1
fi

get_real_path() {
    # Resolve a possibly relative path to an absolute one.
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

export RANK_SIZE=$1
export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
TRAIN_DATA_DIR=$(get_real_path $2)

if [ ! -d $TRAIN_DATA_DIR ]; then
    echo "error: TRAIN_DATA_DIR=$TRAIN_DATA_DIR is not a directory"
    exit 1
fi

# Start from a clean working directory for this run.
if [ -d "distribute_train" ]; then
    rm -rf ./distribute_train
fi

mkdir ./distribute_train
cp ./*.py ./distribute_train
cp ./*.yaml ./distribute_train
cp -r ./src ./distribute_train
cd ./distribute_train || exit

mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
    nohup python train.py \
    --is_distributed=True \
    --config_path=./gpu_default_config.yaml \
    --train_dataset_path=$TRAIN_DATA_DIR \
    --device_target=GPU > log.txt 2>&1 &
cd ..
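A typical invocation, run from the model's root directory so the `cp ./*.py` steps pick up the sources (the dataset path is a placeholder):

```shell
# Launch 8-GPU data-parallel training; logs are written to distribute_train/log.txt.
sh scripts/run_distribute_train_gpu.sh 8 /path/to/imagenet/train
tail -f distribute_train/log.txt
```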
@@ -0,0 +1,42 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Run as: sh scripts/run_eval_gpu.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT]

get_real_path() {
    # Resolve a possibly relative path to an absolute one.
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

DEVICE_ID=$1
DATA_DIR=$(get_real_path $2)
PATH_CHECKPOINT=$(get_real_path $3)
export CUDA_VISIBLE_DEVICES=$DEVICE_ID

BASEPATH=$(cd "`dirname $0`" || exit; pwd)
CONFIG_FILE="${BASEPATH}/../gpu_default_config.yaml"

# Run evaluation in a clean ../eval directory.
if [ -d "../eval" ]; then
    rm -rf ../eval
fi
mkdir ../eval
cd ../eval || exit

python ${BASEPATH}/../eval.py \
    --config_path=$CONFIG_FILE \
    --ckpt_path=$PATH_CHECKPOINT \
    --eval_dataset_path=$DATA_DIR > eval.log 2>&1 &
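A typical invocation from the model's root directory (dataset and checkpoint paths are placeholders):

```shell
# Evaluate a trained checkpoint on GPU device 0.
# The script creates an eval/ directory one level above the current directory
# and writes its output to eval.log inside it.
sh scripts/run_eval_gpu.sh 0 /path/to/imagenet/val /path/to/shufflenetv1.ckpt
tail -f ../eval/eval.log
```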
@@ -0,0 +1,43 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Run as: sh scripts/run_standalone_train_gpu.sh [DEVICE_ID] [DATA_DIR]

get_real_path() {
    # Resolve a possibly relative path to an absolute one.
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m $PWD/$1)"
    fi
}

DEVICE_ID=$1
DATA_DIR=$(get_real_path $2)
export CUDA_VISIBLE_DEVICES=$DEVICE_ID

BASEPATH=$(cd ./"`dirname $0`" || exit; pwd)
CONFIG_FILE="$BASEPATH/../gpu_default_config.yaml"
train_path=train_standalone${DEVICE_ID}

# Start from a clean per-device training directory.
if [ -d ${train_path} ]; then
    rm -rf ${train_path}
fi
mkdir -p ${train_path}
echo "start training for device $DEVICE_ID"
cd ${train_path} || exit

python ${BASEPATH}/../train.py \
    --config_path=$CONFIG_FILE \
    --train_dataset_path=$DATA_DIR > log.txt 2>&1 &
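A typical invocation from the model's root directory (the dataset path is a placeholder):

```shell
# Train on a single GPU (device 0); the run executes in ./train_standalone0 and logs to log.txt.
sh scripts/run_standalone_train_gpu.sh 0 /path/to/imagenet/train
tail -f train_standalone0/log.txt
```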
@@ -22,7 +22,7 @@ from mindspore.nn.optim.momentum import Momentum
from mindspore.train.model import Model, ParallelMode
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.communication.management import init, get_group_size
from mindspore.communication.management import init, get_rank, get_group_size
from mindspore.train.loss_scale_manager import FixedLossScaleManager
from src.lr_generator import get_lr
from src.shufflenetv1 import ShuffleNetV1

@@ -30,8 +30,7 @@ from src.dataset import create_dataset
from src.crossentropysmooth import CrossEntropySmooth
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id, get_rank_id
from src.model_utils.device_adapter import get_device_id

set_seed(1)

@@ -49,7 +48,7 @@ def train():
    if os.getenv('DEVICE_ID', "not_set").isdigit():
        context.set_context(device_id=get_device_id())
    init()
    rank = get_rank_id()
    rank = get_rank()
    group_size = get_group_size()
    parallel_mode = ParallelMode.DATA_PARALLEL
    context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True)