From d854e28ad85d816e8d8f6a06823479754669e8b5 Mon Sep 17 00:00:00 2001
From: chenweitao_295 <chenweitao4@huawei.com>
Date: Wed, 14 Jul 2021 15:19:15 +0800
Subject: [PATCH] shufflenetv1 add gpu mode

---
 .../official/cv/shufflenetv1/README_CN.md     | 65 ++++++++++++-----
 .../cv/shufflenetv1/gpu_default_config.yaml   | 72 +++++++++++++++++++
 .../scripts/run_distribute_train_gpu.sh       | 55 ++++++++++++++
 .../cv/shufflenetv1/scripts/run_eval_gpu.sh   | 42 +++++++++++
 .../scripts/run_standalone_train_gpu.sh       | 43 +++++++++++
 model_zoo/official/cv/shufflenetv1/train.py   |  7 +-
 6 files changed, 262 insertions(+), 22 deletions(-)
 create mode 100644 model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml
 create mode 100644 model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh
 create mode 100644 model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh
 create mode 100644 model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh

diff --git a/model_zoo/official/cv/shufflenetv1/README_CN.md b/model_zoo/official/cv/shufflenetv1/README_CN.md
index 6f5cf9e9355..6c9aceba069 100644
--- a/model_zoo/official/cv/shufflenetv1/README_CN.md
+++ b/model_zoo/official/cv/shufflenetv1/README_CN.md
@@ -62,8 +62,11 @@ ShuffleNetV1的核心部分被分成三个阶段，每个阶段重复堆积了
   ├─README_CN.md                              # ShuffleNetV1相关描述
   ├─scripts
     ├─run_standalone_train.sh                 # Ascend环境下的单卡训练脚本
+    ├─run_standalone_train_gpu.sh             # GPU环境下的单卡训练脚本
     ├─run_distribute_train.sh                 # Ascend环境下的八卡并行训练脚本
+    ├─run_distribute_train_gpu.sh             # GPU环境下的八卡并行训练脚本
     ├─run_eval.sh                             # Ascend环境下的评估脚本
+    ├─run_eval_gpu.sh                         # GPU环境下的评估脚本
     ├─run_infer_310.sh                        # Ascend 310 推理shell脚本
   ├─src
     ├─dataset.py                              # 数据预处理
@@ -76,6 +79,7 @@ ShuffleNetV1的核心部分被分成三个阶段，每个阶段重复堆积了
       ├──local_adapter.py                     # 设备相关信息
       ├──moxing_adapter.py                    # 装饰器(主要用于ModelArts数据拷贝)
   ├─default_config.yaml                       # 参数文件
+  ├─gpu_default_config.yaml                   # GPU参数文件
   ├─train.py                                  # 网络训练脚本
   ├─export.py                                 # 模型格式转换脚本
   ├─eval.py                                   # 网络评估脚本
@@ -107,6 +111,8 @@ ShuffleNetV1的核心部分被分成三个阶段，每个阶段重复堆积了
 'momentum': 0.9                     # Momentum中的动量参数
 ```
 
+如需获取更多信息，Ascend请查看`default_config.yaml`，GPU请查看`gpu_default_config.yaml`。
+
 ## 训练过程
 
 ### 启动
@@ -115,12 +121,30 @@ ShuffleNetV1的核心部分被分成三个阶段，每个阶段重复堆积了
 
 ```shell
 # 训练示例
+- running on Ascend with default parameters
+
   python:
       Ascend单卡训练示例：python train.py --train_dataset_path [DATA_DIR]
 
   shell:
       Ascend八卡并行训练: sh scripts/run_distribute_train.sh [RANK_TABLE_FILE] [DATA_DIR]
       Ascend单卡训练示例: sh scripts/run_standalone_train.sh [DEVICE_ID] [DATA_DIR]
+
+- running on GPU with gpu default parameters
+
+  python:
+      GPU单卡训练示例：python train.py --config_path [CONFIG_PATH] --train_dataset_path [DATA_DIR] --device_target [DEVICE_TARGET]
+      GPU八卡训练示例：
+          export RANK_SIZE=8
+          mpirun --allow-run-as-root -n $RANK_SIZE --output-filename log_output --merge-stderr-to-stdout \
+          python train.py --config_path [CONFIG_PATH] \
+                          --train_dataset_path [TRAIN_DATA_DIR] \
+                          --is_distributed=True \
+                          --device_target=GPU > log.txt 2>&1 &
+
+  shell:
+      GPU单卡训练示例: sh scripts/run_standalone_train_gpu.sh [DEVICE_ID] [DATA_DIR]
+      GPU八卡并行训练: sh scripts/run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR]
 ```
 
   分布式训练需要提前创建JSON格式的HCCL配置文件。
@@ -148,15 +172,20 @@ epoch time: 99864.092, per step time: 79.827, avg loss: 3.442
 您可以使用python或shell脚本进行评估。
 
 ```shell
-# 评估示例
+# Ascend评估示例
   python:
       python eval.py --eval_dataset_path [DATA_DIR] --ckpt_path [PATH_CHECKPOINT]
 
   shell:
       sh scripts/run_eval.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT]
-```
 
-> 训练过程中可以生成ckpt文件。
+# GPU评估示例
+  python:
+      python eval.py --config_path [CONFIG_PATH] --eval_dataset_path [DATA_DIR] --ckpt_path [PATH_CHECKPOINT]
+
+  shell:
+      sh scripts/run_eval_gpu.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT]
+```
 
 ### 结果
 
@@ -272,21 +301,21 @@ Densenet121网络使用ImageNet推理得到的结果如下:
 
 ## 训练性能
 
-| 参数                        | Ascend                                |
-| -------------------------- | ------------------------------------- |
-| 模型名称                    | ShuffleNetV1                           |
-| 运行环境                    | Ascend 910；系统 Euler2.8                            |
-| 上传时间                    | 2020-12-3                             |
-| MindSpore 版本             | 1.0.0                                 |
-| 数据集                      | imagenet                              |
-| 训练参数                    | src/config.py                         |
-| 优化器                      | Momentum                              |
-| 损失函数                    | SoftmaxCrossEntropyWithLogits         |
-| 最终损失                    | 2.05                                  |
-| 精确度 (8p)                 | Top1[73.9%], Top5[91.4%]               |
-| 训练总时间 (8p)             | 7.0h                                    |
-| 评估总时间                  | 99s                                    |
-| 参数量 (M)                 | 44M                                   |
+| 参数                        | Ascend                                | GPU                                |
+| -------------------------- | ------------------------------------- | -------------------------- |
+| 模型名称                    | ShuffleNetV1                           | ShuffleNetV1                           |
+| 运行环境                    | Ascend 910；系统 Euler2.8               | Tesla V100；系统 Euler2.8                            |
+| 上传时间                    | 2020-12-3                             | 2021-07-15                            |
+| MindSpore 版本             | 1.0.0                                 | 1.3.0                                 |
+| 数据集                      | imagenet                              | imagenet                              |
+| 训练参数                    | default_config.yaml                    | gpu_default_config.yaml                |
+| 优化器                      | Momentum                              | Momentum                              |
+| 损失函数                    | SoftmaxCrossEntropyWithLogits         | SoftmaxCrossEntropyWithLogits         |
+| 最终损失                    | 2.05                                  | 2.04                                  |
+| 精确度 (8p)                 | Top1[73.9%], Top5[91.4%]               | Top1[73.8%], Top5[91.4%]               |
+| 训练总时间 (8p)             | 7.0h                                    | 20.0h                                    |
+| 评估总时间                  | 99s                                    | 58s                                    |
+| 参数量 (M)                 | 44M                                   | 51.3M                                   |
 | 脚本                       | [链接](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/official/cv/shufflenetv1) |
 
 # 随机情况的描述
diff --git a/model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml b/model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml
new file mode 100644
index 00000000000..62e98ab50ec
--- /dev/null
+++ b/model_zoo/official/cv/shufflenetv1/gpu_default_config.yaml
@@ -0,0 +1,72 @@
+# Builtin Configurations(DO NOT CHANGE THESE CONFIGURATIONS unless you know exactly what you are doing)
+enable_modelarts: False
+# url for modelarts
+data_url: ""
+train_url: ""
+checkpoint_url: ""
+# path for local
+data_path: "/cache/data"
+output_path: "/cache/train"
+load_path: "/cache/checkpoint_path"
+device_target: "GPU"
+enable_profiling: False
+
+# ======================================================================================
+# common options
+num_classes: 1000
+label_smooth_factor: 0.1
+model_size: "2.0x"
+device_id: 0
+
+# ======================================================================================
+# Training options
+epoch_size: 250
+keep_checkpoint_max: 5
+save_ckpt_path: "./"
+save_checkpoint_epochs: 1
+save_checkpoint: True
+amp_level: "O2"
+is_distributed: False
+train_dataset_path: ""
+resume: ""
+
+# Dataset config
+batch_size: 128
+
+#learning rate config
+decay_method: "cosine"
+lr_init: 0.00
+lr_max: 0.50
+lr_end: 0.00
+warmup_epochs: 4
+loss_scale: 1024
+
+#optimization config
+weight_decay: 0.00004
+momentum: 0.9
+
+# ======================================================================================
+# Eval options
+ckpt_path: ""
+eval_dataset_path: ""
+
+# ======================================================================================
+# export options
+file_name: "shufflenetv1"
+file_format: "MINDIR"
+
+---
+# Help description for each configuration
+enable_modelarts: "Whether training on modelarts default: False"
+data_url: "Url for modelarts"
+train_url: "Url for modelarts"
+data_path: "The location of input data"
+output_path: "The location of the output file"
+device_target: "target device to run on, GPU or Ascend. (Default: GPU)"
+enable_profiling: "Whether enable profiling while training default: False"
+is_distributed: "distributed training"
+resume: "resume training with existed checkpoint"
+model_size: "shuffleNetV1 model size choices 2.0x, 1.5x, 1.0x, 0.5x"
+device_id: "device id"
+file_name: "output file name"
+file_format: "file format choices [AIR MINDIR ONNX]"
diff --git a/model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh b/model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh
new file mode 100644
index 00000000000..55c2e6ea0cf
--- /dev/null
+++ b/model_zoo/official/cv/shufflenetv1/scripts/run_distribute_train_gpu.sh
@@ -0,0 +1,55 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# run as sh scripts/run_distribute_train_gpu.sh RANK_SIZE TRAIN_DATA_DIR
+# ============================================================================
+if [ "$#" -ne 2 ]; then
+  echo "Usage: sh run_distribute_train_gpu.sh [RANK_SIZE] [TRAIN_DATA_DIR]"
+  exit 1
+fi
+
+# Print $1 as an absolute path; POSIX-only syntax so `sh` (e.g. dash) can run it.
+get_real_path() {
+  case "$1" in
+    /*) echo "$1" ;;
+    *) realpath -m "$PWD/$1" ;;
+  esac
+}
+export RANK_SIZE="$1"
+export CUDA_VISIBLE_DEVICES=0,1,2,3,4,5,6,7
+TRAIN_DATA_DIR=$(get_real_path "$2")
+# Fail early when the dataset directory is missing; quoted so paths with spaces work.
+if [ ! -d "$TRAIN_DATA_DIR" ]; then
+  echo "error: TRAIN_DATA_DIR=$TRAIN_DATA_DIR is not a directory"
+  exit 1
+fi
+# Start from a clean run directory (created relative to the current directory).
+if [ -d "distribute_train" ]; then
+  rm -rf ./distribute_train
+fi
+# Stage the scripts, configs and sources into the run directory.
+mkdir ./distribute_train
+cp ./*.py ./distribute_train
+cp ./*.yaml ./distribute_train
+cp -r ./src ./distribute_train
+cd ./distribute_train || exit
+# Launch RANK_SIZE training processes in the background; output goes to log.txt.
+mpirun --allow-run-as-root -n "$RANK_SIZE" --output-filename log_output --merge-stderr-to-stdout \
+nohup python train.py  \
+  --is_distributed=True \
+  --config_path=./gpu_default_config.yaml \
+  --train_dataset_path="$TRAIN_DATA_DIR" \
+  --device_target=GPU > log.txt 2>&1 &
+cd ..
+
diff --git a/model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh b/model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh
new file mode 100644
index 00000000000..c9553a3d962
--- /dev/null
+++ b/model_zoo/official/cv/shufflenetv1/scripts/run_eval_gpu.sh
@@ -0,0 +1,42 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# run as sh scripts/run_eval_gpu.sh DEVICE_ID DATA_DIR PATH_CHECKPOINT
+# ============================================================================
+# Print $1 as an absolute path; POSIX-only syntax so `sh` (e.g. dash) can run it.
+get_real_path(){
+    case "$1" in
+        /*) echo "$1" ;;
+        *) realpath -m "$PWD/$1" ;;
+    esac
+}
+
+# Evaluate a checkpoint on one GPU; runs in the eval directory beside scripts/, logs to eval.log.
+[ "$#" -eq 3 ] || { echo "Usage: sh run_eval_gpu.sh [DEVICE_ID] [DATA_DIR] [PATH_CHECKPOINT]"; exit 1; }
+export CUDA_VISIBLE_DEVICES="$1"
+DATA_DIR=$(get_real_path "$2")
+PATH_CHECKPOINT=$(get_real_path "$3")
+# Anchor every path to this script's directory so `rm -rf` never depends on the caller's cwd.
+BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CONFIG_FILE="${BASEPATH}/../gpu_default_config.yaml"
+EVAL_DIR="${BASEPATH}/../eval"
+if [ -d "$EVAL_DIR" ]; then
+    rm -rf "$EVAL_DIR"
+fi
+mkdir "$EVAL_DIR"
+cd "$EVAL_DIR" || exit
+python "${BASEPATH}/../eval.py" \
+    --config_path="$CONFIG_FILE" \
+    --ckpt_path="$PATH_CHECKPOINT" \
+    --eval_dataset_path="$DATA_DIR" > eval.log 2>&1 &
diff --git a/model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh b/model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh
new file mode 100644
index 00000000000..a34cb269359
--- /dev/null
+++ b/model_zoo/official/cv/shufflenetv1/scripts/run_standalone_train_gpu.sh
@@ -0,0 +1,43 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# run as sh scripts/run_standalone_train_gpu.sh DEVICE_ID DATA_DIR
+# ============================================================================
+# Print $1 as an absolute path; POSIX-only syntax so `sh` (e.g. dash) can run it.
+get_real_path(){
+    case "$1" in
+        /*) echo "$1" ;;
+        *) realpath -m "$PWD/$1" ;;
+    esac
+}
+
+# Train on a single GPU; runs in ./train_standalone<DEVICE_ID> and logs to log.txt there.
+[ "$#" -eq 2 ] || { echo "Usage: sh run_standalone_train_gpu.sh [DEVICE_ID] [DATA_DIR]"; exit 1; }
+DEVICE_ID="$1"
+DATA_DIR=$(get_real_path "$2")
+export CUDA_VISIBLE_DEVICES="$DEVICE_ID"
+# No "./" prefix on the dirname: it would break when the script is invoked by absolute path.
+BASEPATH=$(cd "$(dirname "$0")" || exit; pwd)
+CONFIG_FILE="$BASEPATH/../gpu_default_config.yaml"
+train_path="train_standalone${DEVICE_ID}"
+if [ -d "${train_path}" ]; then
+  rm -rf "${train_path}"
+fi
+mkdir -p "${train_path}"
+echo "start training for device $DEVICE_ID"
+cd "${train_path}" || exit
+python "${BASEPATH}/../train.py"  \
+    --config_path="$CONFIG_FILE" \
+    --train_dataset_path="$DATA_DIR" > log.txt 2>&1 &
+
diff --git a/model_zoo/official/cv/shufflenetv1/train.py b/model_zoo/official/cv/shufflenetv1/train.py
index 7ad9818bb82..048f9bf030c 100644
--- a/model_zoo/official/cv/shufflenetv1/train.py
+++ b/model_zoo/official/cv/shufflenetv1/train.py
@@ -22,7 +22,7 @@ from mindspore.nn.optim.momentum import Momentum
 from mindspore.train.model import Model, ParallelMode
 from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor, LossMonitor
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
-from mindspore.communication.management import init, get_group_size
+from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.train.loss_scale_manager import FixedLossScaleManager
 from src.lr_generator import get_lr
 from src.shufflenetv1 import ShuffleNetV1
@@ -30,8 +30,7 @@ from src.dataset import create_dataset
 from src.crossentropysmooth import CrossEntropySmooth
 from src.model_utils.config import config
 from src.model_utils.moxing_adapter import moxing_wrapper
-from src.model_utils.device_adapter import get_device_id, get_rank_id
-
+from src.model_utils.device_adapter import get_device_id
 
 set_seed(1)
 
@@ -49,7 +48,7 @@ def train():
         if os.getenv('DEVICE_ID', "not_set").isdigit():
             context.set_context(device_id=get_device_id())
         init()
-        rank = get_rank_id()
+        rank = get_rank()
         group_size = get_group_size()
         parallel_mode = ParallelMode.DATA_PARALLEL
         context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=group_size, gradients_mean=True)