!19527 cnn_direction_model gpu version

Merge pull request !19527 from panfengfeng/add_cnn_direction_model_gpu_version
This commit is contained in:
i-robot 2021-07-09 07:14:49 +00:00 committed by Gitee
commit 17c9b7397d
4 changed files with 214 additions and 10 deletions

View File

@ -0,0 +1,84 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Launch distributed GPU training for cnn_direction_model via mpirun.
# Args: DEVICE_NUM VISIBLE_DEVICES DATASET_PATH [PRETRAINED_CKPT_PATH]
if [ $# != 3 ] && [ $# != 4 ]
then
    echo "Usage:
          sh scripts/run_distribute_train_gpu.sh [DEVICE_NUM] [VISIBLE_DEVICES(0,1,2,3,4,5,6,7)] [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
         "
    exit 1
fi

# BUG FIX: the original test used '&&' ([ $1 -lt 1 ] && [ $1 -gt 8 ]), which
# can never be true, so an invalid device count was never rejected. A count is
# invalid when it is below 1 OR above 8.
if [ "$1" -lt 1 ] || [ "$1" -gt 8 ]
then
    echo "error: DEVICE_NUM=$1 is not in (1-8)"
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m "$PWD/$1")"
    fi
}

PATH1=$(get_real_path "$3")
if [ $# == 4 ]
then
    PATH2=$(get_real_path "$4")
fi

# Validate inputs before doing any destructive work (rm -rf below).
if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ $# == 4 ] && [ ! -f "$PATH2" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

export DEVICE_NUM=$1
export RANK_SIZE=$1

# Start from a clean working directory so stale logs/checkpoints never mix in.
if [ -d "./train" ];
then
    rm -rf ./train
fi
mkdir ./train
cd ./train || exit

export CUDA_VISIBLE_DEVICES="$2"

# Launch one training process per device; output goes to train.log.
if [ $# == 3 ]
then
    mpirun -n "$1" --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
           python ../train.py \
           --run_distribute=True \
           --train_dataset_path="$PATH1" > train.log 2>&1 &
fi

if [ $# == 4 ]
then
    mpirun -n "$1" --allow-run-as-root --output-filename log_output --merge-stderr-to-stdout \
           python ../train.py \
           --run_distribute=True \
           --train_dataset_path="$PATH1" \
           --pre_trained="$PATH2" > train.log 2>&1 &
fi

View File

@ -0,0 +1,48 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Launch GPU evaluation for cnn_direction_model.
# Args: DATASET_PATH PRETRAINED_CKPT_PATH
if [ $# != 2 ]
then
    echo "Usage:
          sh scripts/run_eval_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH]
         "
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m "$PWD/$1")"
    fi
}

PATH1=$(get_real_path "$1")
PATH2=$(get_real_path "$2")

# ROBUSTNESS FIX: the original validated only the checkpoint; a bad dataset
# path was silently accepted and eval.py failed later. Check it up front,
# consistent with run_distribute_train_gpu.sh.
if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ ! -f "$PATH2" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

# Start from a clean working directory so stale logs never mix in.
if [ -d "./eval" ];
then
    rm -rf ./eval
fi
mkdir ./eval
cd ./eval || exit

# Run evaluation in the background; output goes to eval.log.
python ../eval.py --eval_dataset_path="$PATH1" --checkpoint_path="$PATH2" > eval.log 2>&1 &

View File

@ -0,0 +1,61 @@
#!/bin/bash
# Copyright 2021 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
# Launch single-device GPU training for cnn_direction_model.
# Args: DATASET_PATH [PRETRAINED_CKPT_PATH]
if [ $# != 1 ] && [ $# != 2 ]
then
    echo "Usage:
          sh scripts/run_standalone_train_gpu.sh [DATASET_PATH] [PRETRAINED_CKPT_PATH](optional)
         "
    exit 1
fi

# Resolve a possibly-relative path to an absolute one.
get_real_path(){
    if [ "${1:0:1}" == "/" ]; then
        echo "$1"
    else
        echo "$(realpath -m "$PWD/$1")"
    fi
}

PATH1=$(get_real_path "$1")
if [ $# == 2 ]
then
    PATH2=$(get_real_path "$2")
fi

# ROBUSTNESS FIX: the original never validated the dataset path, unlike
# run_distribute_train_gpu.sh. Check it before the destructive rm -rf below.
if [ ! -d "$PATH1" ]
then
    echo "error: DATASET_PATH=$PATH1 is not a directory"
    exit 1
fi

if [ $# == 2 ] && [ ! -f "$PATH2" ]
then
    echo "error: PRETRAINED_CKPT_PATH=$PATH2 is not a file"
    exit 1
fi

# Start from a clean working directory so stale logs/checkpoints never mix in.
if [ -d "./train" ];
then
    rm -rf ./train
fi
mkdir ./train
cd ./train || exit

# Run training in the background; output goes to train.log.
if [ $# == 1 ]
then
    python ../train.py --train_dataset_path="$PATH1" > train.log 2>&1 &
fi

if [ $# == 2 ]
then
    python ../train.py --train_dataset_path="$PATH1" --pre_trained="$PATH2" > train.log 2>&1 &
fi

View File

@ -23,7 +23,7 @@ import mindspore as ms
from mindspore import Tensor
from mindspore import context
from mindspore import dataset as de
from mindspore.communication.management import init
from mindspore.communication.management import init, get_rank
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
from mindspore.nn.metrics import Accuracy
from mindspore.nn.optim.adam import Adam
@ -101,21 +101,30 @@ def train():
target = config.device_target
ckpt_save_dir = config.save_checkpoint_path
# init context
device_id = get_device_id()
rank_id = get_rank_id()
rank_size = get_device_num()
run_distribute = rank_size > 1
context.set_context(mode=context.GRAPH_MODE,
device_target=target,
device_id=device_id, save_graphs=False)
save_graphs=False)
rank_size = get_device_num()
run_distribute = rank_size > 1
device_id = get_device_id()
if target == "Ascend":
# init context
rank_id = get_rank_id()
context.set_context(device_id=device_id)
if run_distribute:
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
init()
elif target == "GPU":
rank_id = 0
if run_distribute:
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
init()
rank_id = get_rank()
print("train args: ", config, "\ncfg: ", config,
"\nparallel args: rank_id {}, device_id {}, rank_size {}".format(rank_id, device_id, rank_size))
if run_distribute:
context.set_auto_parallel_context(device_num=rank_size, parallel_mode=ParallelMode.DATA_PARALLEL)
init()
config.rank_save_ckpt_flag = 0
if config.is_save_on_master:
@ -130,6 +139,8 @@ def train():
".mindrecord0", config=config, dataset_name=dataset_name)
step_size = dataset.get_dataset_size()
print("step_size ", step_size, flush=True)
# define net
net = CNNDirectionModel([3, 64, 48, 48, 64], [64, 48, 48, 64, 64], [256, 64], [64, 512])