fcn-4 gpu compatibility

maijianqiang 2021-08-10 09:23:27 +08:00
parent 1b91c025fe
commit 2e00d43f45
7 changed files with 101 additions and 22 deletions

View File

@@ -188,6 +188,8 @@ SLOG_PRINT_TO_STDOUT=1 python eval.py --device_id 0
│ ├──run_train.sh // shell script for distributed training on Ascend
│ ├──run_eval.sh // shell script for evaluation on Ascend
│ ├──run_process_data.sh // shell script for converting audio clips to MindRecord
│ ├──run_train_gpu.sh // shell script for training on GPU
│ ├──run_eval_gpu.sh // shell script for evaluation on GPU
├── src
│ ├──dataset.py // creating dataset
│ ├──pre_process_data.py // pre-process dataset
@@ -253,7 +255,13 @@ Parameters for both training and evaluation can be set in default_config.yaml
- running on Ascend
```shell
python train.py > train.log 2>&1 &
python train.py --device_target Ascend > train.log 2>&1 &
```
- running on GPU
```shell
python train.py --device_target GPU --data_dir [dataset dir path] --checkpoint_path [checkpoint save dir] > train.log 2>&1 &
```
The python command above runs in the background; you can view the results through the file `train.log`.
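For example, a minimal way to check on the background run (assuming the `train.log` path used in the commands above):
```shell
# follow the training log as it is written
tail -f train.log
# confirm the background training process is still running
ps -ef | grep train.py
```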
@@ -310,21 +318,21 @@ AUC: 0.90995
#### Evaluation Performance
| Parameters | Ascend |
| -------------------------- | ----------------------------------------------------------- |
| Model Version | FCN-4 |
| Resource | Ascend 910; CPU 2.60GHz, 56 cores; Memory 314G; OS Euler2.8 |
| uploaded Date | 07/05/2021 (month/day/year) |
| MindSpore Version | 1.3.0 |
| Training Parameters | epoch=10, steps=534, batch_size = 32, lr=0.005 |
| Optimizer | Adam |
| Loss Function | Binary cross entropy |
| outputs | probability |
| Loss | AUC 0.909 |
| Speed | 1pc: 160 samples/sec; |
| Total time | 1pc: 20 mins; |
| Checkpoint for Fine tuning | 198.73M(.ckpt file) |
| Scripts | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4) |
| Parameters | Ascend | GPU |
| -------------------------- | ----------------------------------------------------------- | ----------------------------------------------------------- |
| Model Version | FCN-4 | FCN-4 |
| Resource | Ascend 910; CPU 2.60GHz, 56 cores; Memory 314G; OS Euler2.8 | Tesla V100-PCIE 32G |
| uploaded Date | 07/05/2021 (month/day/year) | 07/26/2021 (month/day/year) |
| MindSpore Version | 1.3.0 | 1.3.0 |
| Training Parameters | epoch=10, steps=534, batch_size = 32, lr=0.005 | epoch=10, steps=534, batch_size = 32, lr=0.005 |
| Optimizer | Adam | Adam |
| Loss Function | Binary cross entropy | Binary cross entropy |
| outputs | probability | probability |
| Loss | AUC 0.909 | AUC 0.909 |
| Speed | 1pc: 160 samples/sec; | 1pc: 160 samples/sec; |
| Total time | 1pc: 20 mins; | 1pc: 20 mins; |
| Checkpoint for Fine tuning | 198.73M(.ckpt file) | 198.73M(.ckpt file) |
| Scripts | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4) | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4) |
## [ModelZoo Homepage](#contents)

View File

@@ -6,7 +6,7 @@ checkpoint_url: ""
data_path: "/cache/data"
output_path: "/cache/train"
load_path: "/cache/checkpoint_path"
device_target: Ascend
device_target: "Ascend"
enable_profiling: False
# ==============================================================================

View File

@@ -18,13 +18,11 @@ python eval.py
'''
import numpy as np
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id
from src.musictagger import MusicTaggerCNN
from src.dataset import create_dataset
import mindspore.common.dtype as mstype
from mindspore import context
from mindspore import Tensor
@@ -113,12 +111,15 @@ def validation(net, model_path, data_dir, filename, num_consumer, batch):
def modelarts_process():
pass
@moxing_wrapper(pre_process=modelarts_process)
def fcn4_eval():
"""
eval network
"""
context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE, device_id=get_device_id())
context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE)
if config.device_target == 'Ascend':
context.set_context(device_id=get_device_id())
network = MusicTaggerCNN(in_classes=[1, 128, 384, 768, 2048],
kernel_size=[3, 3, 3, 3, 3],
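With this change eval.py is target-aware: `device_target` comes from the config, and `set_context(device_id=...)` is only applied on Ascend because on GPU the visible device is chosen through `CUDA_VISIBLE_DEVICES` (which is what the new GPU shell scripts in this commit do). A minimal sketch of invoking the GPU evaluation path directly, with placeholder paths:
```shell
# the GPU is selected by index here; no device_id is passed to set_context on GPU
export CUDA_VISIBLE_DEVICES=0
# placeholder paths; this mirrors the run_eval_gpu.sh script added in this commit
python eval.py --device_target=GPU --data_dir=/path/to/Music-Tagging --checkpoint_path=/path/to/checkpoints > eval.log 2>&1 &
```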

View File

@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "run as sh run_train_gpu.sh [CUDA_VISIBLE_DEVICES] [DATA_PATH] [CKPT_PATH]"
echo "for example sh run_train_gpu.sh 0 /home/dataset/Music-Tagging /home/fcn-4/"
export CUDA_VISIBLE_DEVICES=$1
DATA_PATH=$2
CKPT_PATH=$3
export SLOG_PRINT_TO_STDOUT=1
rm -rf eval_gpu
mkdir eval_gpu
python ../eval.py --data_dir=$DATA_PATH --checkpoint_path=$CKPT_PATH --device_target=GPU > eval_gpu/eval.log 2>&1 &

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "run as sh run_train_gpu.sh [CUDA_VISIBLE_DEVICES] [DATA_PATH] [CKPT_PATH(options)]"
echo "for example sh run_train_gpu.sh 0 /home/dataset/Music-Tagging /home/fcn-4/(options)"
export CUDA_VISIBLE_DEVICES=$1
DATA_PATH=$2
CKPT_PATH="./"
PRE_TRAINED=False
export SLOG_PRINT_TO_STDOUT=1
if [ $# == 3 ]
then
CKPT_PATH=$3
PRE_TRAINED=True
fi
rm -rf train_gpu
mkdir train_gpu
echo "start training"
python ../train.py --data_dir=$DATA_PATH --checkpoint_path=$CKPT_PATH \
--pre_trained=$PRE_TRAINED \
--device_target=GPU > train_gpu/train.log 2>&1 &
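Illustrative invocations of the script above, with placeholder paths: two arguments train from scratch (`PRE_TRAINED=False`), while an optional third argument supplies a checkpoint directory and switches `--pre_trained=True`:
```shell
# train from scratch on GPU 0
bash run_train_gpu.sh 0 /path/to/Music-Tagging
# continue training from an existing checkpoint directory (sets --pre_trained=True)
bash run_train_gpu.sh 0 /path/to/Music-Tagging /path/to/checkpoints
```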

View File

@@ -124,4 +124,5 @@ def get_config():
final_config = merge(args, default)
return Config(final_config)
config = get_config()

View File

@@ -16,7 +16,7 @@
##############train models#################
python train.py
'''
import os
from mindspore import context, nn
from mindspore.train import Model
from mindspore.common import set_seed
@@ -35,6 +35,7 @@ from src.loss import BCELoss
def modelarts_pre_process():
pass
@moxing_wrapper(pre_process=modelarts_pre_process)
def train(model, dataset_direct, filename, columns_list, num_consumer=4,
batch=16, epoch=50, save_checkpoint_steps=2172, keep_checkpoint_max=50,
@@ -58,8 +59,12 @@ def train(model, dataset_direct, filename, columns_list, num_consumer=4,
if __name__ == "__main__":
set_seed(1)
context.set_context(device_target='Ascend', mode=context.GRAPH_MODE, device_id=get_device_id())
config.checkpoint_path = os.path.abspath(config.checkpoint_path)
context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE)
context.set_context(enable_auto_mixed_precision=config.mixed_precision)
if config.device_target == 'Ascend':
context.set_context(device_id=get_device_id())
network = MusicTaggerCNN(in_classes=[1, 128, 384, 768, 2048],
kernel_size=[3, 3, 3, 3, 3],
padding=[0] * 5,