!5601 fix shufflenet_scripts

Merge pull request !5601 from panfengfeng/fix_shufflenet_scripts
2020-09-01 09:17:18 +08:00 · 2020-09-01 09:17:18 +08:00 · 0f344f33e2
parent d92c220cc0 121943bdb5
commit 0f344f33e2
7 changed files with 114 additions and 26 deletions
--- a/model_zoo/official/cv/shufflenetv2/Readme.md
+++ b/model_zoo/official/cv/shufflenetv2/Readme.md
@ -55,7 +55,7 @@ Dataset used: [imagenet](http://www.image-net.org/)
  +-- Readme.md     # descriptions about ShuffleNetV2
  +-- scripts
  ¦   +--run_distribute_train_for_gpu.sh   # shell script for distributed training
-  ¦   +--run_eval_for_multi_gpu.sh         # shell script for evaluation
+  ¦   +--run_eval_for_gpu.sh         # shell script for evaluation
  ¦   +--run_standalone_train_for_gpu.sh   # shell script for standalone training
  +-- src
  ¦   +--config.py      # parameter configuration
@ -75,23 +75,23 @@ Dataset used: [imagenet](http://www.image-net.org/)

 You can start training using python or shell scripts. The usage of shell scripts as follows:

- Ditributed training on GPU: sh run_distribute_train_for_gpu.sh [DATA_DIR]
- Standalone training on GPU: sh run_standalone_train_for_gpu.sh [DEVICE_ID] [DATA_DIR]
+- Distributed training on GPU: sh run_distribute_train_for_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH]
+- Standalone training on GPU: sh run_standalone_train_for_gpu.sh [DATASET_PATH]

 ### Launch

 ```
 # training example
  python:
-      GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed --platform 'GPU' --dataset_path '~/imagenet/train/' > train.log 2>&1 &
+      GPU: mpirun --allow-run-as-root -n 8 python train.py --is_distributed=True --platform='GPU' --dataset_path='~/imagenet/train/' > train.log 2>&1 &

  shell:
-      GPU: sh run_distribute_train_for_gpu.sh ~/imagenet/train/
+      GPU: cd scripts && sh run_distribute_train_for_gpu.sh 8 0,1,2,3,4,5,6,7 ~/imagenet/train/
 ```

 ### Result

-Training result will be stored in the example path. Checkpoints will be stored at `. /checkpoint` by default, and training log  will be redirected to `./train/train.log`.
+Training result will be stored in the example path. Checkpoints will be stored at `./checkpoint` by default, and training log will be redirected to `./train/train.log`.

 ## [Eval process](#contents)

@ -99,21 +99,21 @@ Training result will be stored in the example path. Checkpoints will be stored a

 You can start evaluation using python or shell scripts. The usage of shell scripts as follows:

- GPU: sh run_eval_for_multi_gpu.sh [DEVICE_ID] [EPOCH]
+- GPU: sh run_eval_for_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]

 ### Launch

 ``` 
 # infer example
  python:
-      GPU: CUDA_VISIBLE_DEVICES=0 python eval.py --platform 'GPU' --dataset_path '~/imagenet/val/' --epoch 250 > eval.log 2>&1 &
+      GPU: CUDA_VISIBLE_DEVICES=0 python eval.py --platform='GPU' --dataset_path='~/imagenet/val/' --checkpoint='checkpoint_file' > eval.log 2>&1 &

  shell:
-      GPU: sh run_eval_for_multi_gpu.sh 0 250
+      GPU: cd scripts && sh run_eval_for_gpu.sh ~/imagenet/val/ 'checkpoint_file'
 ```

 > checkpoint can be produced in training process.

 ### Result

-Inference result will be stored in the example path, you can find result in `val.log`.
+Inference result will be stored in the example path, you can find result in `eval.log`.
--- a/model_zoo/official/cv/shufflenetv2/eval.py
+++ b/model_zoo/official/cv/shufflenetv2/eval.py
@ -31,7 +31,6 @@ if __name__ == '__main__':
    parser.add_argument('--checkpoint', type=str, default='', help='checkpoint of ShuffleNetV2 (Default: None)')
    parser.add_argument('--dataset_path', type=str, default='', help='Dataset path')
    parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform')
-    parser.add_argument('--epoch', type=str, default='')
    args_opt = parser.parse_args()

    if args_opt.platform == 'Ascend':
@ -43,7 +42,7 @@ if __name__ == '__main__':
    ckpt = load_checkpoint(args_opt.checkpoint)
    load_param_into_net(net, ckpt)
    net.set_train(False)
-    dataset = create_dataset(args_opt.dataset_path, cfg, False)
+    dataset = create_dataset(args_opt.dataset_path, False, 0, 1)
    loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False,
                                            smooth_factor=0.1, num_classes=cfg.num_classes)
    eval_metrics = {'Loss': nn.Loss(),
--- a/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh
+++ b/model_zoo/official/cv/shufflenetv2/scripts/run_distribute_train_for_gpu.sh
@ -13,5 +13,45 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-DATA_DIR=$1
-mpirun --allow-run-as-root -n 8 python ./train.py --is_distributed --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+if [ $# -lt 3 ]; then
+    echo "Usage: \
+          sh run_distribute_train_for_gpu.sh [DEVICE_NUM] [VISIABLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] \
+          "
+    exit 1
+fi
+
+# DEVICE_NUM must lie in 1-8: reject it when EITHER bound is violated.
+if [ "$1" -lt 1 ] || [ "$1" -gt 8 ]; then
+    echo "error: DEVICE_NUM=$1 is not in (1-8)"
+    exit 1
+fi
+
+# check dataset file
+if [ ! -d "$3" ]; then
+    echo "error: DATASET_PATH=$3 is not a directory"
+    exit 1
+fi
+
+# Resolve the dataset path now; the cd below would invalidate a relative one.
+DATASET_PATH=$(cd "$3" && pwd)
+
+export DEVICE_NUM=$1
+export RANK_SIZE=$1
+
+BASEPATH=$(cd "`dirname $0`" || exit; pwd)
+export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+if [ -d "../train" ]; then
+    rm -rf ../train
+fi
+mkdir ../train
+cd ../train || exit
+
+export CUDA_VISIBLE_DEVICES="$2"
+
+# More than one device: launch one training process per device via mpirun.
+if [ "$1" -gt 1 ]; then
+    mpirun -n "$1" --allow-run-as-root \
+    python "${BASEPATH}/../train.py" --platform='GPU' --is_distributed=True --dataset_path="${DATASET_PATH}" > train.log 2>&1 &
+else
+    python "${BASEPATH}/../train.py" --platform='GPU' --dataset_path="${DATASET_PATH}" > train.log 2>&1 &
+fi
--- a/model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_multi_gpu.sh
+++ b/model_zoo/official/cv/shufflenetv2/scripts/run_eval_for_multi_gpu.sh
@ -13,6 +13,35 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-DEVICE_ID=$1
-EPOCH=$2
-CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./eval.py --platform 'GPU' --dataset_path '/home/data/ImageNet_Original/val/' --epoch $EPOCH > eval.log 2>&1 &
+if [ $# != 2 ]; then
+    echo "GPU: sh run_eval_for_gpu.sh [DATASET_PATH] [CHECKPOINT_PATH]"
+    exit 1
+fi
+
+# check dataset file
+if [ ! -d "$1" ]; then
+    echo "error: DATASET_PATH=$1 is not a directory"
+    exit 1
+fi
+
+# check checkpoint file
+if [ ! -f "$2" ]; then
+    echo "error: CHECKPOINT_PATH=$2 is not a file"
+    exit 1
+fi
+
+# Resolve both paths now; the cd below would invalidate relative ones.
+DATASET_PATH=$(cd "$1" && pwd)
+CHECKPOINT_PATH=$(cd "$(dirname "$2")" && pwd)/$(basename "$2")
+
+BASEPATH=$(cd "`dirname $0`" || exit; pwd)
+export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+export DEVICE_ID=0
+
+if [ -d "../eval" ]; then
+    rm -rf ../eval
+fi
+mkdir ../eval
+cd ../eval || exit
+
+python "${BASEPATH}/../eval.py" --dataset_path="${DATASET_PATH}" --checkpoint="${CHECKPOINT_PATH}" > ./eval.log 2>&1 &
--- a/model_zoo/official/cv/shufflenetv2/scripts/run_standalone_train_for_gpu.sh
+++ b/model_zoo/official/cv/shufflenetv2/scripts/run_standalone_train_for_gpu.sh
@ -13,6 +13,28 @@
 # See the License for the specific language governing permissions and
 # limitations under the License.
 # ============================================================================
-DEVICE_ID=$1
-DATA_DIR=$2
-CUDA_VISIBLE_DEVICES=$DEVICE_ID python ./train.py --platform 'GPU' --dataset_path $DATA_DIR > train.log 2>&1 &
+if [ $# -lt 1 ]; then
+    echo "Usage: \
+          sh run_standalone_train_for_gpu.sh [DATASET_PATH] \
+          "
+    exit 1
+fi
+
+# check dataset file
+if [ ! -d "$1" ]; then
+    echo "error: DATASET_PATH=$1 is not a directory"
+    exit 1
+fi
+
+# Resolve the dataset path now; the cd below would invalidate a relative one.
+DATASET_PATH=$(cd "$1" && pwd)
+
+BASEPATH=$(cd "`dirname $0`" || exit; pwd)
+export PYTHONPATH=${BASEPATH}:$PYTHONPATH
+if [ -d "../train" ]; then
+    rm -rf ../train
+fi
+mkdir ../train
+cd ../train || exit
+
+python "${BASEPATH}/../train.py" --platform='GPU' --dataset_path="${DATASET_PATH}" > train.log 2>&1 &
--- a/model_zoo/official/cv/shufflenetv2/src/dataset.py
+++ b/model_zoo/official/cv/shufflenetv2/src/dataset.py
@ -75,7 +75,5 @@ def create_dataset(dataset_path, do_train, rank, group_size, repeat_num=1):
    ds = ds.map(input_columns="label", operations=type_cast_op, num_parallel_workers=cfg.work_nums)
    # apply batch operations
    ds = ds.batch(cfg.batch_size, drop_remainder=True)
-    # apply dataset repeat operation
-    ds = ds.repeat(repeat_num)

    return ds
--- a/model_zoo/official/cv/shufflenetv2/train.py
+++ b/model_zoo/official/cv/shufflenetv2/train.py
@ -14,6 +14,7 @@
 # ============================================================================
 """train_imagenet."""
 import argparse
+import ast
 import os
 import random
 import numpy as np
@ -23,7 +24,7 @@ from network import ShuffleNetV2
 import mindspore.nn as nn
 from mindspore import context
 from mindspore import dataset as de
-from mindspore import ParallelMode
+from mindspore.context import ParallelMode
 from mindspore import Tensor
 from mindspore.communication.management import init, get_rank, get_group_size
 from mindspore.nn.optim.momentum import Momentum
@ -42,10 +43,9 @@ de.config.set_seed(cfg.random_seed)

 if __name__ == '__main__':
    parser = argparse.ArgumentParser(description='image classification training')
-    parser.add_argument('--dataset_path', type=str, default='/home/data/imagenet_jpeg/train/', help='Dataset path')
+    parser.add_argument('--dataset_path', type=str, default='', help='Dataset path')
    parser.add_argument('--resume', type=str, default='', help='resume training with existed checkpoint')
-    parser.add_argument('--is_distributed', action='store_true', default=False,
-                        help='distributed training')
+    parser.add_argument('--is_distributed', type=ast.literal_eval, default=False, help='distributed training')
    parser.add_argument('--platform', type=str, default='GPU', choices=('Ascend', 'GPU'), help='run platform')
    parser.add_argument('--model_size', type=str, default='1.0x', help='ShuffleNetV2 model size parameter')
    args_opt = parser.parse_args()