GPU script adaptation

2021-07-15 09:22:48 +08:00 · 2021-07-15 09:22:48 +08:00 · f841c25f0a
parent c9af7643d7
commit f841c25f0a
7 changed files with 264 additions and 45 deletions
--- a/model_zoo/research/cv/FaceAttribute/README.md
+++ b/model_zoo/research/cv/FaceAttribute/README.md
@ -112,8 +112,12 @@ The entire code structure is as following:
  ├─ scripts
    ├─ run_standalone_train.sh              # launch standalone training(1p) in ascend
    ├─ run_distribute_train.sh              # launch distributed training(8p) in ascend
+    ├─ run_standalone_train_gpu.sh          # launch standalone training(1p) in GPU
+    ├─ run_distribute_train_gpu.sh          # launch distributed training(8p) in GPU
    ├─ run_eval.sh                          # launch evaluating in ascend
+    ├─ run_eval_gpu.sh                      # launch evaluating in GPU
    ├─ run_export.sh                        # launch exporting air model
+    └─ run_infer_310.sh                     # shell script for 310 inference
  ├─ src
    ├─ FaceAttribute
      ├─ cross_entropy.py                   # cross entropy loss
@ -144,14 +148,14 @@ The entire code structure is as following:

 - Stand alone mode

-    ```bash
+    ```bash Ascend
    cd ./scripts
    sh run_standalone_train.sh [MINDRECORD_FILE] [USE_DEVICE_ID]
    ```

    or (fine-tune)

-    ```bash
+    ```bash Ascend
    cd ./scripts
    sh run_standalone_train.sh [MINDRECORD_FILE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
    ```
@ -163,27 +167,65 @@ The entire code structure is as following:
    sh run_standalone_train.sh /home/train.mindrecord 0 /home/a.ckpt
    ```

- Distribute mode (recommended)
-
-    ```bash
+    ```bash GPU
    cd ./scripts
-    sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE]
+    sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES]
    ```

    or (fine-tune)

-    ```bash
+    ```bash GPU
    cd ./scripts
-    sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE] [PRETRAINED_BACKBONE]
+    sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]
    ```

    for example:

    ```bash
    cd ./scripts
+    sh run_standalone_train_gpu.sh /home/train.mindrecord 0 /home/a.ckpt
+    ```
+
+- Distribute mode (recommended)
+
+    ```bash Ascend
+    cd ./scripts
+    sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE]
+    ```
+
+    or (fine-tune)
+
+    ```bash Ascend
+    cd ./scripts
+    sh run_distribute_train.sh [MINDRECORD_FILE] [RANK_TABLE] [PRETRAINED_BACKBONE]
+    ```
+
+    for example:
+
+    ```bash Ascend
+    cd ./scripts
    sh run_distribute_train.sh /home/train.mindrecord ./rank_table_8p.json /home/a.ckpt
    ```

+    ```bash GPU
+    cd ./scripts
+    sh run_distribute_train_gpu.sh [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE]
+    ```
+
+    or (fine-tune)
+
+    ```bash GPU
+    cd ./scripts
+    sh run_distribute_train_gpu.sh [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE] [PRETRAINED_BACKBONE]
+    ```
+
+    for example:
+
+    ```bash GPU
+    cd ./scripts
+    sh run_distribute_train_gpu.sh 8 0,1,2,3,4,5,6,7 /home/train.mindrecord /home/a.ckpt
+    ```
+
 You will get the loss value of each step as following in "./output/[TIME]/[TIME].log" or "./scripts/device0/train.log":

 ```python
@ -285,14 +327,26 @@ epoch[69], iter[6150], loss:1.167064, 9300.77 imgs/sec

 ### Evaluation

-```bash
+```bash Ascend
 cd ./scripts
 sh run_eval.sh [MINDRECORD_FILE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
 ```

 for example:

-```bash
+```bash Ascend
+cd ./scripts
+sh run_eval.sh /home/eval.mindrecord 0 /home/a.ckpt
+```
+
+```bash GPU
+cd ./scripts
+sh run_eval_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]
+```
+
+for example:
+
+```bash GPU
 cd ./scripts
 sh run_eval_gpu.sh /home/eval.mindrecord 0 /home/a.ckpt
 ```
@ -315,7 +369,7 @@ mask f1:  0.9992691394116572

 If you want to infer the network on Ascend 310, you should convert the model to AIR:

-```bash
+```bash Ascend
 cd ./scripts
 sh run_export.sh [BATCH_SIZE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
 ```
@ -325,7 +379,7 @@ sh run_export.sh [BATCH_SIZE] [USE_DEVICE_ID] [PRETRAINED_BACKBONE]
 #### Export MindIR

 ```shell
-python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT]
+python export.py --ckpt_file [CKPT_PATH] --file_name [FILE_NAME] --file_format [FILE_FORMAT] --device_target [DEVICE_TARGET]
 ```

 The ckpt_file parameter is required,
@ -362,36 +416,36 @@ Inference result is saved in current path, you can find result like this in acc.

 ### Training Performance

-| Parameters                 | Face Attribute                                              |
-| -------------------------- | ----------------------------------------------------------- |
-| Model Version              | V1                                                          |
-| Resource                   | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8             |
-| uploaded Date              | 09/30/2020 (month/day/year)                                 |
-| MindSpore Version          | 1.0.0                                                       |
-| Dataset                    | 91K images                                                  |
-| Training Parameters        | epoch=70, batch_size=128, momentum=0.9, lr=0.001            |
-| Optimizer                  | Momentum                                                    |
-| Loss Function              | Softmax Cross Entropy                                       |
-| outputs                    | probability                                                 |
-| Speed                      | 1pc: 200~250 ms/step; 8pcs: 100~150 ms/step                 |
-| Total time                 | 1pc: 2.5 hours; 8pcs: 0.3 hours                             |
-| Checkpoint for Fine tuning | 88M (.ckpt file)                                            |
+| Parameters                 | Face Attribute                                              |  Face Attribute                                             |
+| -------------------------- | ----------------------------------------------------------- | ----------------------------------------------------------- |
+| Model Version              | V1                                                          | V1                                                          |
+| Resource                   | Ascend 910; CPU 2.60GHz, 192cores; Memory 755G; OS Euler2.8 | Tesla V100-PCIE-32G                                         |
+| uploaded Date              | 09/30/2020 (month/day/year)                                 | 07/19/2021 (month/day/year)                                 |
+| MindSpore Version          | 1.0.0                                                       | 1.3.0                                                       |
+| Dataset                    | 91K images                                                  | 91K images                                                  |
+| Training Parameters        | epoch=70, batch_size=128, momentum=0.9, lr=0.001            | epoch=70, batch_size=128, momentum=0.9, lr=0.001            |
+| Optimizer                  | Momentum                                                    | Momentum                                                    |
+| Loss Function              | Softmax Cross Entropy                                       | Softmax Cross Entropy                                       |
+| outputs                    | probability                                                 | probability                                                 |
+| Speed                      | 1pc: 200~250 ms/step; 8pcs: 100~150 ms/step                 | 1pc: 115~125 ms/step; 8pcs: 150~200 ms/step                 |
+| Total time                 | 1pc: 2.5 hours; 8pcs: 0.3 hours                             | 1pc: 1.5 hours; 8pcs: 0.4 hours                             |
+| Checkpoint for Fine tuning | 88M (.ckpt file)                                            | 88M (.ckpt file)                                            |

 ### Evaluation Performance

-| Parameters          | Face Attribute              |
-| ------------------- | --------------------------- |
-| Model Version       | V1                          |
-| Resource            | Ascend 910; OS Euler2.8                  |
-| Uploaded Date       | 09/30/2020 (month/day/year) |
-| MindSpore Version   | 1.0.0                       |
-| Dataset             | 11K images                  |
-| batch_size          | 1                           |
-| outputs             | accuracy                    |
-| Accuracy(8pcs)      | age:45.7%                   |
-|                     | gender:89.5%                |
-|                     | mask:99.2%                  |
-| Model for inference | 88M (.ckpt file)            |
+| Parameters          | Face Attribute              | Face Attribute              |
+| ------------------- | --------------------------- | --------------------------- |
+| Model Version       | V1                          | V1                          |
+| Resource            | Ascend 910; OS Euler2.8     | Tesla V100-PCIE-32G         |
+| Uploaded Date       | 09/30/2020 (month/day/year) | 07/19/2021 (month/day/year) |
+| MindSpore Version   | 1.0.0                       | 1.3.0                       |
+| Dataset             | 11K images                  | 11K images                  |
+| batch_size          | 1                           | 1                           |
+| outputs             | accuracy                    | accuracy                    |
+| Accuracy(8pcs)      | age:45.7%                   | age:49.0%                   |
+|                     | gender:89.5%                | gender:90.8%                |
+|                     | mask:99.2%                  | mask:99.3%                  |
+| Model for inference | 88M (.ckpt file)            | 88M (.ckpt file)            |

 # [ModelZoo Homepage](#contents)

--- a/model_zoo/research/cv/FaceAttribute/eval.py
+++ b/model_zoo/research/cv/FaceAttribute/eval.py
@ -33,6 +33,7 @@ from model_utils.device_adapter import get_device_id, get_device_num
 def softmax(x, axis=0):
    '''Return the softmax of ``x`` computed along ``axis``.

    The maximum along ``axis`` is subtracted before exponentiating so that
    large inputs do not overflow to ``inf`` (which would turn the result into
    ``nan``). The shift cancels out in the ratio, so the values are otherwise
    unchanged. ``keepdims=True`` keeps the reduced axis so the division
    broadcasts correctly for any ``axis``, not only ``axis=0``.
    '''
    exps = np.exp(x - np.max(x, axis=axis, keepdims=True))
    return exps / np.sum(exps, axis=axis, keepdims=True)

+
 def load_pretrain(checkpoint, network):
    '''load pretrain model.'''
    if os.path.isfile(checkpoint):
@ -51,8 +52,10 @@ def load_pretrain(checkpoint, network):
        print('-----------------------load model failed-----------------------')
    return network

+
 def modelarts_pre_process():
    '''modelarts pre process function.'''
+
    def unzip(zip_file, save_dir):
        import zipfile
        s_time = time.time()
@ -106,7 +109,8 @@ def modelarts_pre_process():
@moxing_wrapper(pre_process=modelarts_pre_process)
 def run_eval():
    '''run eval.'''
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=get_device_id())
+    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False,
+                        device_id=get_device_id())

    network = get_resnet18(config)
    ckpt_path = config.model_path
--- a/model_zoo/research/cv/FaceAttribute/export.py
+++ b/model_zoo/research/cv/FaceAttribute/export.py
@ -33,7 +33,7 @@ def modelarts_pre_process():
 def run_export():
    '''run export.'''
    devid = 0
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=devid)
+    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False, device_id=devid)

    network = get_resnet18(config)
    ckpt_path = config.ckpt_file
--- a/model_zoo/research/cv/FaceAttribute/scripts/run_distribute_train_gpu.sh
+++ b/model_zoo/research/cv/FaceAttribute/scripts/run_distribute_train_gpu.sh
@ -0,0 +1,56 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 3 ] && [ $# != 4 ]
+then
+    echo "Usage: sh run_distribute_train_gpu [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE] [PRETRAINED_BACKBONE]"
+    echo "   or: sh run_distrubute_train_gpu.sh [DEVICE_NUM] [CUDA_VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [MINDRECORD_FILE]"
+exit 1
+fi
+
+current_exec_path=$(pwd)
+echo ${current_exec_path}
+
+dirname_path=$(dirname "$(pwd)")
+echo ${dirname_path}
+
+export PYTHONPATH=${dirname_path}:$PYTHONPATH
+export CUDA_VISIBLE_DEVICES=$2
+export RANK_SIZE=$1
+
+SCRIPT_NAME='train.py'
+ulimit -c unlimited
+
+echo 'start training'
+export RANK_ID=0
+rm -rf train_distribute_gpu
+mkdir train_distribute_gpu
+cd train_distribute_gpu
+
+if [ $# == 3 ]
+then
+  mpirun -n $1 --allow-run-as-root python ${dirname_path}/${SCRIPT_NAME} \
+      --world_size=$1 \
+      --device_target='GPU' \
+      --mindrecord_path=$3 > train.log 2>&1 &
+else
+  mpirun -n $1 --allow-run-as-root python ${dirname_path}/${SCRIPT_NAME} \
+      --world_size=$1 \
+      --device_target='GPU' \
+      --mindrecord_path=$3 \
+      --pretrained=$4 > train.log  2>&1 &
+fi
+echo 'running'
--- a/model_zoo/research/cv/FaceAttribute/scripts/run_eval_gpu.sh
+++ b/model_zoo/research/cv/FaceAttribute/scripts/run_eval_gpu.sh
@ -0,0 +1,46 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 3 ]
+then
+    echo "Usage: sh run_eval.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]"
+exit 1
+fi
+
+current_exec_path=$(pwd)
+echo ${current_exec_path}
+
+dirname_path=$(dirname "$(pwd)")
+echo ${dirname_path}
+
+export PYTHONPATH=${dirname_path}:$PYTHONPATH
+export CUDA_VISIBLE_DEVICES=$2
+export RANK_SIZE=1
+
+SCRIPT_NAME='eval.py'
+
+ulimit -c unlimited
+echo 'start evaluating'
+export RANK_ID=0
+rm -rf eval_gpu
+mkdir eval_gpu
+cd eval_gpu
+
+python ${dirname_path}/${SCRIPT_NAME} \
+    --mindrecord_path=$1 \
+    --device_target="GPU" \
+    --model_path=$3 > eval.log  2>&1 &
+echo 'running'
--- a/model_zoo/research/cv/FaceAttribute/scripts/run_standalone_train_gpu.sh
+++ b/model_zoo/research/cv/FaceAttribute/scripts/run_standalone_train_gpu.sh
@ -0,0 +1,57 @@
+#!/bin/bash
+# Copyright 2021 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+if [ $# != 2 ] && [ $# != 3 ]
+then
+    echo "Usage: sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES] [PRETRAINED_BACKBONE]"
+    echo "   or: sh run_standalone_train_gpu.sh [MINDRECORD_FILE] [CUDA_VISIBLE_DEVICES]"
+exit 1
+fi
+
+current_exec_path=$(pwd)
+echo ${current_exec_path}
+
+dirname_path=$(dirname "$(pwd)")
+echo ${dirname_path}
+
+export PYTHONPATH=${dirname_path}:$PYTHONPATH
+export RANK_SIZE=1
+export CUDA_VISIBLE_DEVICES=$2
+
+SCRIPT_NAME='train.py'
+
+ulimit -c unlimited
+
+echo 'start training'
+export RANK_ID=0
+rm -rf train_alone_gpu
+mkdir train_alone_gpu
+cd train_alone_gpu
+
+if [ $# == 2 ]
+then
+  python ${dirname_path}/${SCRIPT_NAME} \
+      --world_size=1 \
+      --device_target='GPU' \
+      --mindrecord_path=$1 > train.log  2>&1 &
+else
+  python ${dirname_path}/${SCRIPT_NAME} \
+      --world_size=1 \
+      --device_target='GPU' \
+      --mindrecord_path=$1 \
+      --pretrained=$3 > train.log  2>&1 &
+fi
+echo 'running'
--- a/model_zoo/research/cv/FaceAttribute/train.py
+++ b/model_zoo/research/cv/FaceAttribute/train.py
@ -16,7 +16,6 @@
 import os
 import time
 import datetime
-
 import mindspore
 import mindspore.nn as nn
 from mindspore import context
@ -29,13 +28,11 @@ from mindspore.train.callback import ModelCheckpoint, RunContext, CheckpointConf
 from mindspore.train.serialization import load_checkpoint, load_param_into_net
 from mindspore.ops import operations as P
 from mindspore.common import dtype as mstype
-
 from src.FaceAttribute.resnet18 import get_resnet18
 from src.FaceAttribute.loss_factory import get_loss
 from src.dataset_train import data_generator
 from src.lrsche_factory import warmup_step
 from src.logging import get_logger, AverageMeter
-
 from model_utils.config import config
 from model_utils.moxing_adapter import moxing_wrapper
 from model_utils.device_adapter import get_device_id, get_device_num
@ -50,8 +47,10 @@ class InternalCallbackParam(dict):
    def __setattr__(self, _key, _value):
        self[_key] = _value

+
 class BuildTrainNetwork(nn.Cell):
    '''Build train network.'''
+
    def __init__(self, my_network, my_criterion):
        super(BuildTrainNetwork, self).__init__()
        self.network = my_network
@ -66,6 +65,7 @@ class BuildTrainNetwork(nn.Cell):

 def modelarts_pre_process():
    '''modelarts pre process function.'''
+
    def unzip(zip_file, save_dir):
        import zipfile
        s_time = time.time()
@ -121,7 +121,8 @@ def modelarts_pre_process():
@moxing_wrapper(pre_process=modelarts_pre_process)
 def run_train():
    '''run train.'''
-    context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=False, device_id=get_device_id())
+    context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target, save_graphs=False,
+                        device_id=get_device_id())
    mindspore.set_seed(1)

    # init distributed
@ -241,5 +242,6 @@ def run_train():

    config.logger.info('--------- trains out ---------')

+
 if __name__ == "__main__":
    run_train()