From d854e28ad85d816e8d8f6a06823479754669e8b5 Mon Sep 17 00:00:00 2001 From: chenweitao_295 Date: Wed, 14 Jul 2021 15:19:15 +0800 Subject: [PATCH] shufflenetv1 add gpu mode --- .../official/cv/shufflenetv1/README_CN.md | 65 ++++++++++++----- .../cv/shufflenetv1/gpu_default_config.yaml | 72 +++++++++++++++++++ .../scripts/run_distribute_train_gpu.sh | 55 ++++++++++++++ .../cv/shufflenetv1/scripts/run_eval_gpu.sh | 42 +++++++++++ .../scripts/run_standalone_train_gpu.sh | 43 +++++++++++ model_zoo/official/cv/shufflenetv1/train.py | 7 +- 6 files changed, 262 insertions(+), 22 deletions(-) create mode 100644 model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml create mode 100644 model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh create mode 100644 model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh create mode 100644 model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh diff --git a/model_zoo/official/cv/shufflenetv1/README_CN.md b/model_zoo/official/cv/shufflenetv1/README_CN.md index 6f5cf9e9355..6c9aceba069 100644 --- a/model_zoo/official/cv/shufflenetv1/README_CN.md +++ b/model_zoo/official/cv/shufflenetv1/README_CN.md @@ -62,8 +62,11 @@ ShuffleNetV1的核心部分被分成三个阶段,每个阶段重复堆积了 ├─README_CN.md # ShuffleNetV1相关描述 ├─scripts ├─run_standalone_train.sh # Ascend环境下的单卡训练脚本 + ├─run_standalone_train_gpu.sh # GPU环境下的单卡训练脚本 ├─run_distribute_train.sh # Ascend环境下的八卡并行训练脚本 + ├─run_distribute_train_gpu.sh # GPU环境下的八卡并行训练脚本 ├─run_eval.sh # Ascend环境下的评估脚本 + ├─run_eval_gpu.sh # GPU环境下的评估脚本 ├─run_infer_310.sh # Ascend 310 推理shell脚本 ├─src ├─dataset.py # 数据预处理 @@ -76,6 +79,7 @@ ShuffleNetV1的核心部分被分成三个阶段,每个阶段重复堆积了 ├──local_adapter.py # 设备相关信息 ├──moxing_adapter.py # 装饰器(主要用于ModelArts数据拷贝) ├─default_config.yaml # 参数文件 + ├─gpu_default_config.yaml # GPU参数文件 ├─train.py # 网络训练脚本 ├─export.py # 模型格式转换脚本 ├─eval.py # 网络评估脚本 @@ -107,6 +111,8 @@ ShuffleNetV1的核心部分被分成三个阶段,每个阶段重复堆积了 'momentum': 0.9 # Momentum中的动量参数 ``` +如需获取更多信息,Ascend请查看`default_config.yaml`, 
GPU请查看`gpu_default_config.yaml`. + ## 训练过程 ### 启动 @@ -115,12 +121,30 @@ ShuffleNetV1的核心部分被分成三个阶段,每个阶段重复堆积了 ```shell # 训练示例 +- running on Ascend with default parameters + python: Ascend单卡训练示例:python train.py --train_dataset_path [DATA_DIR] shell: Ascend八卡并行训练: sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_DIR] Ascend单卡训练示例: sh scripts/run_standalone_train.sh [DEVICE_ID] [DATA_DIR] + +- running on GPU with gpu default parameters + + python: + GPU单卡训练示例:python train.py --config_path [CONFIG_PATH] --train_dataset_path [DATA_DIR] --device_target [DEVICE_TARGET] + GPU八卡训练示例: + export RANK_SIZE=8 + mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ + python train.py --config_path [CONFIG_PATH] \ + --train_dataset_path [TRAIN_DATA_DIR] \ + --is_distributed=True \ + --device_target=GPU > log.txt 2>&1 & + + shell: + GPU单卡训练示例: sh scripts/run_standalone_train_gpu.sh [DEVICE_ID] [DATA_DIR] + GPU八卡并行训练: sh scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR] ``` 分布式训练需要提前创建JSON格式的HCCL配置文件。 @@ -148,15 +172,20 @@ epoch time: 99864.092, per step time: 79.827, avg loss: 3.442 您可以使用python或shell脚本进行评估。 ```shell -# 评估示例 +# Ascend评估示例 python: python eval.py --eval_dataset_path [DATA_DIR] --ckpt_path [PATH_CHECKPOINT] shell: sh scripts/run_eval.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT] -``` -> 训练过程中可以生成ckpt文件。 +# GPU评估示例 + python: + python eval.py --config_path [CONFIG_PATH] --eval_dataset_path [DATA_DIR] --ckpt_path [PATH_CHECKPOINT] + + shell: + sh scripts/run_eval_gpu.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT] +``` ### 结果 @@ -272,21 +301,21 @@ Densenet121网络使用ImageNet推理得到的结果如下: ## 训练性能 -| 参数 | Ascend | -| -------------------------- | ------------------------------------- | -| 模型名称 | ShuffleNetV1 | -| 运行环境 | Ascend 910;系统 Euler2.8 | -| 上传时间 | 2020-12-3 | -| MindSpore 版本 | 1.0.0 | -| 数据集 | imagenet | -| 训练参数 | src/config.py | -| 优化器 | Momentum | -| 损失函数 | SoftmaxCrossEntropyWithLogits | -| 最终损失 | 2.05 | -| 精确度 (8p) | Top1[73.9%], Top5[91.4%] | 
-| 训练总时间 (8p) | 7.0h | -| 评估总时间 | 99s | -| 参数量 (M) | 44M | +| 参数 | Ascend | GPU | +| -------------------------- | ------------------------------------- | -------------------------- | +| 模型名称 | ShuffleNetV1 | ShuffleNetV1 | +| 运行环境 | Ascend 910;系统 Euler2.8 | Tesla V100;系统 Euler2.8 | +| 上传时间 | 2020-12-3 | 2021-07-15 | +| MindSpore 版本 | 1.0.0 | 1.3.0 | +| 数据集 | imagenet | imagenet | +| 训练参数 | default_config.yaml | gpu_default_config.yaml | +| 优化器 | Momentum | Momentum | +| 损失函数 | SoftmaxCrossEntropyWithLogits | SoftmaxCrossEntropyWithLogits | +| 最终损失 | 2.05 | 2.04 | +| 精确度 (8p) | Top1[73.9%], Top5[91.4%] | Top1[73.8%], Top5[91.4%] | +| 训练总时间 (8p) | 7.0h | 20.0h | +| 评估总时间 | 99s | 58s | +| 参数量 (M) | 44M | 51.3M | | 脚本 | [链接](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/shufflenetv1) | # 随机情况的描述 diff --git a/model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml b/model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml new file mode 100644 index 00000000000..62e98ab50ec --- /dev/null +++ b/model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml @@ -0,0 +1,72 @@ +# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing) +enable_modelarts: False +# url for modelarts +data_url: "" +train_url: "" +checkpoint_url: "" +# path for local +data_path: "/cache/data" +output_path: "/cache/train" +load_path: "/cache/checkpoint_path" +device_target: "GPU" +enable_profiling: False + +# ====================================================================================== +# common options +num_classes: 1000 +label_smooth_factor: 0.1 +model_size: "2.0x" +device_id: 0 + +# ====================================================================================== +# Training options +epoch_size: 250 +keep_checkpoint_max: 5 +save_ckpt_path: "./" +save_checkpoint_epochs: 1 +save_checkpoint: True +amp_level: "O2" +is_distributed: False +train_dataset_path: "" +resume: "" + +# Dataset config +batch_size: 128 + 
+#learning rate config +decay_method: "cosine" +lr_init: 0.00 +lr_max: 0.50 +lr_end: 0.00 +warmup_epochs: 4 +loss_scale: 1024 + +#optimization config +weight_decay: 0.00004 +momentum: 0.9 + +# ====================================================================================== +# Eval options +ckpt_path: "" +eval_dataset_path: "" + +# ====================================================================================== +# export options +file_name: "shufflenetv1" +file_format: "MINDIR" + +--- +# Help description for each configuration +enable_modelarts: "Whether training on modelarts default: False" +data_url: "Url for modelarts" +train_url: "Url for modelarts" +data_path: "The location of input data" +output_path: "The location of the output file" +device_target: "device target, GPU or Ascend. (Default: None)" +enable_profiling: "Whether enable profiling while training default: False" +is_distributed: "distributed training" +resume: "resume training with existed checkpoint" +model_size: "shuffleNetV1 model size choices 2.0x, 1.5x, 1.0x, 0.5x" +device_id: "device id" +file_name: "output file name" +file_format: "file format choices [AIR MINDIR ONNX]" diff --git a/model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh new file mode 100644 index 00000000000..55c2e6ea0cf --- /dev/null +++ b/model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh @@ -0,0 +1,55 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# run as sh scripts/run_distribute_train_gpu.sh RANK_SIZE TRAIN_DATA_DIR +# ============================================================================ +if [ $# != 2 ]; then + echo "Usage: sh run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR]" + exit 1 +fi + +get_real_path() { + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} +export RANK_SIZE=$1 +export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7 +TRAIN_DATA_DIR=$(get_real_path $2) + +if [ ! -d $TRAIN_DATA_DIR ]; then + echo "error: TRAIN_DATA_DIR=$TRAIN_DATA_DIR is not a directory" + exit 1 +fi + +if [ -d "distribute_train" ]; then + rm -rf ./distribute_train +fi + +mkdir ./distribute_train +cp ./*.py ./distribute_train +cp ./*.yaml ./distribute_train +cp -r ./src ./distribute_train +cd ./distribute_train || exit + +mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \ +nohup python train.py \ + --is_distributed=True \ + --config_path=./gpu_default_config.yaml \ + --train_dataset_path=$TRAIN_DATA_DIR \ + --device_target=GPU > log.txt 2>&1 & +cd .. + diff --git a/model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh b/model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh new file mode 100644 index 00000000000..c9553a3d962 --- /dev/null +++ b/model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh @@ -0,0 +1,42 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. 
+# See the License for the specific language governing permissions and +# limitations under the License. +# run as sh scripts/run_eval_gpu.sh DEVICE_ID DATA_DIR PATH_CHECKPOINT +# ============================================================================ +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +DEVICE_ID=$1 +DATA_DIR=$(get_real_path $2) +PATH_CHECKPOINT=$(get_real_path $3) +export CUDA_VISIBLE_DEVICES=$DEVICE_ID + +BASEPATH=$(cd "`dirname $0`" || exit; pwd) +CONFIG_FILE="${BASEPATH}/../gpu_default_config.yaml" + +if [ -d "../eval" ]; then + rm -rf ../eval +fi +mkdir ../eval +cd ../eval || exit + +python ${BASEPATH}/../eval.py \ + --config_path=$CONFIG_FILE \ + --ckpt_path=$PATH_CHECKPOINT \ + --eval_dataset_path=$DATA_DIR > eval.log 2>&1 & diff --git a/model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh new file mode 100644 index 00000000000..a34cb269359 --- /dev/null +++ b/model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh @@ -0,0 +1,43 @@ +#!/bin/bash +# Copyright 2021 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. +# run as sh scripts/run_standalone_train_gpu.sh DEVICE_ID DATA_DIR 
+# ============================================================================ +get_real_path(){ + if [ "${1:0:1}" == "/" ]; then + echo "$1" + else + echo "$(realpath -m $PWD/$1)" + fi +} + +DEVICE_ID=$1 +DATA_DIR=$(get_real_path $2) +export CUDA_VISIBLE_DEVICES=$DEVICE_ID + +BASEPATH=$(cd ./"`dirname $0`" || exit; pwd) +CONFIG_FILE="$BASEPATH/../gpu_default_config.yaml" +train_path=train_standalone${DEVICE_ID} + +if [ -d ${train_path} ]; then + rm -rf ${train_path} +fi +mkdir -p ${train_path} +echo "start training for device $DEVICE_ID" +cd ${train_path}|| exit + +python ${BASEPATH}/../train.py \ + --config_path=$CONFIG_FILE \ + --train_dataset_path=$DATA_DIR > log.txt 2>&1 & + diff --git a/model_zoo/official/cv/shufflenetv1/train.py b/model_zoo/official/cv/shufflenetv1/train.py index 7ad9818bb82..048f9bf030c 100644 --- a/model_zoo/official/cv/shufflenetv1/train.py +++ b/model_zoo/official/cv/shufflenetv1/train.py @@ -22,7 +22,7 @@ from mindspore.nn.optim.momentum import Momentum from mindspore.train.model import Model, ParallelMode from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor from mindspore.train.serialization import load_checkpoint, load_param_into_net -from mindspore.communication.management import init, get_group_size +from mindspore.communication.management import init, get_rank, get_group_size from mindspore.train.loss_scale_manager import FixedLossScaleManager from src.lr_generator import get_lr from src.shufflenetv1 import ShuffleNetV1 @@ -30,8 +30,7 @@ from src.dataset import create_dataset from src.crossentropysmooth import CrossEntropySmooth from src.model_utils.config import config from src.model_utils.moxing_adapter import moxing_wrapper -from src.model_utils.device_adapter import get_device_id, get_rank_id - +from src.model_utils.device_adapter import get_device_id set_seed(1) @@ -49,7 +48,7 @@ def train(): if os.getenv('DEVICE_ID', "not_set").isdigit(): 
context.set_context(device_id=get_device_id()) init() - rank = get_rank_id() + rank = get_rank() group_size = get_group_size() parallel_mode = ParallelMode.DATA_PARALLEL context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True)