fcn-4 gpu compatibility

maijianqiang 2021-08-10 09:23:27 +08:00
parent 1b91c025fe
commit 2e00d43f45
7 changed files with 101 additions and 22 deletions

View File

@@ -188,6 +188,8 @@ SLOG_PRINT_TO_STDOUT=1 python eval.py --device_id 0
│ ├──run_train.sh // shell script for distributed training on Ascend
│ ├──run_eval.sh // shell script for evaluation on Ascend
│ ├──run_process_data.sh // shell script for converting audio clips to mindrecord
│ ├──run_train_gpu.sh // shell script for distributed training on GPU
│ ├──run_eval_gpu.sh // shell script for evaluation on GPU
├── src
│ ├──dataset.py // creating dataset
│ ├──pre_process_data.py // pre-processing dataset
@@ -253,7 +255,13 @@ Parameters for both training and evaluation can be set in default_config.yaml
- running on Ascend

  ```shell
  python train.py --device_target Ascend > train.log 2>&1 &
  ```

- running on GPU

  ```shell
  python train.py --device_target GPU --data_dir [dataset dir path] --checkpoint_path [checkpoint save dir] > train.log 2>&1 &
  ```

The python command above will run in the background; you can view the results in the file `train.log`.
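Because the command is detached with `> train.log 2>&1 &`, progress has to be checked from the log file. A quick way to do that with standard shell commands (nothing specific to this repo):

```shell
# confirm the backgrounded training job is still alive, then follow its log
jobs
tail -f train.log
```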
@@ -310,20 +318,20 @@ AUC: 0.90995
#### Evaluation Performance

| Parameters                 | Ascend                                                       | GPU                                                          |
| -------------------------- | ------------------------------------------------------------ | ------------------------------------------------------------ |
| Model Version              | FCN-4                                                        | FCN-4                                                        |
| Resource                   | Ascend 910; CPU 2.60GHz, 56 cores; Memory 314G; OS Euler2.8  | Tesla V100-PCIE-32G                                          |
| uploaded Date              | 07/05/2021 (month/day/year)                                  | 07/26/2021 (month/day/year)                                  |
| MindSpore Version          | 1.3.0                                                        | 1.3.0                                                        |
| Training Parameters        | epoch=10, steps=534, batch_size = 32, lr=0.005               | epoch=10, steps=534, batch_size = 32, lr=0.005               |
| Optimizer                  | Adam                                                         | Adam                                                         |
| Loss Function              | Binary cross entropy                                         | Binary cross entropy                                         |
| outputs                    | probability                                                  | probability                                                  |
| Loss                       | AUC 0.909                                                    | AUC 0.909                                                    |
| Speed                      | 1pc: 160 samples/sec                                         | 1pc: 160 samples/sec                                         |
| Total time                 | 1pc: 20 mins                                                 | 1pc: 20 mins                                                 |
| Checkpoint for Fine tuning | 198.73M (.ckpt file)                                         | 198.73M (.ckpt file)                                         |
| Scripts                    | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4) | [music_auto_tagging script](https://gitee.com/mindspore/mindspore/tree/master/model_zoo/research/audio/fcn-4) |

## [ModelZoo Homepage](#contents)

View File

@@ -6,7 +6,7 @@ checkpoint_url: ""
data_path: "/cache/data" data_path: "/cache/data"
output_path: "/cache/train" output_path: "/cache/train"
load_path: "/cache/checkpoint_path" load_path: "/cache/checkpoint_path"
device_target: Ascend device_target: "Ascend"
enable_profiling: False enable_profiling: False
# ============================================================================== # ==============================================================================
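`device_target` still defaults to `"Ascend"` in default_config.yaml. For GPU runs the value can be overridden per invocation, as the README example earlier in this diff does, or changed in the YAML itself; a minimal sketch (the `sed` one-liner is only an illustration, not part of this commit):

```shell
# per-run override (flag shown in the README example)
python train.py --device_target GPU --data_dir [dataset dir path] --checkpoint_path [checkpoint save dir] > train.log 2>&1 &

# or make GPU the default for every run
sed -i 's/device_target: "Ascend"/device_target: "GPU"/' default_config.yaml
```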

View File

@@ -18,13 +18,11 @@ python eval.py
'''
import numpy as np
from src.model_utils.config import config
from src.model_utils.moxing_adapter import moxing_wrapper
from src.model_utils.device_adapter import get_device_id
from src.musictagger import MusicTaggerCNN
from src.dataset import create_dataset
import mindspore.common.dtype as mstype
from mindspore import context
from mindspore import Tensor
@@ -113,12 +111,15 @@ def validation(net, model_path, data_dir, filename, num_consumer, batch):
def modelarts_process():
    pass

@moxing_wrapper(pre_process=modelarts_process)
def fcn4_eval():
    """
    eval network
    """
    context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE)
    if config.device_target == 'Ascend':
        context.set_context(device_id=get_device_id())
    network = MusicTaggerCNN(in_classes=[1, 128, 384, 768, 2048],
                             kernel_size=[3, 3, 3, 3, 3],
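With this change eval.py only binds a `device_id` on Ascend; on GPU the visible card is selected through CUDA_VISIBLE_DEVICES instead. A direct invocation might look like the sketch below (the paths are placeholders taken from the examples in the new GPU scripts, and the flags mirror the new run_eval_gpu.sh wrapper):

```shell
# select the GPU, then run evaluation directly in the background
export CUDA_VISIBLE_DEVICES=0
python eval.py --device_target GPU --data_dir /home/dataset/Music-Tagging --checkpoint_path /home/fcn-4/ > eval.log 2>&1 &
```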

View File

@@ -0,0 +1,27 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "run as sh run_train_gpu.sh [CUDA_VISIBLE_DEVICES] [DATA_PATH] [CKPT_PATH]"
echo "for example sh run_train_gpu.sh 0 /home/dataset/Music-Tagging /home/fcn-4/"
export CUDA_VISIBLE_DEVICES=$1
DATA_PATH=$2
CKPT_PATH=$3
export SLOG_PRINT_TO_STDOUT=1
rm -rf eval_gpu
mkdir eval_gpu
python ../eval.py --data_dir=$DATA_PATH --checkpoint_path=$CKPT_PATH --device_target=GPU > eval_gpu/eval.log 2>&1 &
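A usage sketch for this script; it calls `../eval.py` and writes `eval_gpu/eval.log` into the current directory, so it is presumably meant to be launched from the scripts directory:

```shell
sh run_eval_gpu.sh 0 /home/dataset/Music-Tagging /home/fcn-4/
# the evaluation runs in the background; inspect the result once it finishes
cat eval_gpu/eval.log
```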

View File

@@ -0,0 +1,37 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
echo "run as sh run_train_gpu.sh [CUDA_VISIBLE_DEVICES] [DATA_PATH] [CKPT_PATH(options)]"
echo "for example sh run_train_gpu.sh 0 /home/dataset/Music-Tagging /home/fcn-4/(options)"
export CUDA_VISIBLE_DEVICES=$1
DATA_PATH=$2
CKPT_PATH="./"
PRE_TRAINED=False
export SLOG_PRINT_TO_STDOUT=1
if [ $# == 3 ]
then
CKPT_PATH=$3
PRE_TRAINED=True
fi
rm -rf train_gpu
mkdir train_gpu
echo "start training"
python ../train.py --data_dir=$DATA_PATH --checkpoint_path=$CKPT_PATH \
--pre_trained=$PRE_TRAINED \
--device_target=GPU > train_gpu/train.log 2>&1 &
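Both invocation forms the script supports, as a sketch (paths are the examples from the echo above): with two arguments it trains from scratch, with a third it also passes that checkpoint directory and flips `--pre_trained` to True:

```shell
# fresh GPU training
sh run_train_gpu.sh 0 /home/dataset/Music-Tagging
tail -f train_gpu/train.log

# resume from an existing checkpoint directory (sets PRE_TRAINED=True)
sh run_train_gpu.sh 0 /home/dataset/Music-Tagging /home/fcn-4/
```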

View File

@@ -124,4 +124,5 @@ def get_config():
    final_config = merge(args, default)
    return Config(final_config)

config = get_config()

View File

@@ -16,7 +16,7 @@
##############train models#################
python train.py
'''
import os
from mindspore import context, nn
from mindspore.train import Model
from mindspore.common import set_seed
@@ -35,6 +35,7 @@ from src.loss import BCELoss
def modelarts_pre_process():
    pass

@moxing_wrapper(pre_process=modelarts_pre_process)
def train(model, dataset_direct, filename, columns_list, num_consumer=4,
          batch=16, epoch=50, save_checkpoint_steps=2172, keep_checkpoint_max=50,
@@ -58,8 +59,12 @@ def train(model, dataset_direct, filename, columns_list, num_consumer=4,
if __name__ == "__main__":
    set_seed(1)
    config.checkpoint_path = os.path.abspath(config.checkpoint_path)
    context.set_context(device_target=config.device_target, mode=context.GRAPH_MODE)
    context.set_context(enable_auto_mixed_precision=config.mixed_precision)
    if config.device_target == 'Ascend':
        context.set_context(device_id=get_device_id())
    network = MusicTaggerCNN(in_classes=[1, 128, 384, 768, 2048],
                             kernel_size=[3, 3, 3, 3, 3],
                             padding=[0] * 5,
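train.py now takes the target from the config instead of hard-coding Ascend, and only sets a `device_id` in the Ascend branch. A sketch of the two launch modes (the DEVICE_ID environment variable is an assumption about how get_device_id resolves the id; it is not shown in this diff):

```shell
# Ascend: device id comes from get_device_id(), assumed to read DEVICE_ID
export DEVICE_ID=0
python train.py --device_target Ascend > train.log 2>&1 &

# GPU: no device_id is set in the context; the card is chosen via CUDA_VISIBLE_DEVICES
export CUDA_VISIBLE_DEVICES=0
python train.py --device_target GPU --data_dir [dataset dir path] --checkpoint_path [checkpoint save dir] > train.log 2>&1 &
```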