Add model zoo script for Wide & Deep on GPU

2020-06-18 14:24:04 +08:00 · 2020-06-18 14:24:04 +08:00 · 28492e50ae
parent b106c2204a
commit 28492e50ae
4 changed files with 42 additions and 5 deletions
--- a/model_zoo/wide_and_deep/script/run_multigpu_train.sh
+++ b/model_zoo/wide_and_deep/script/run_multigpu_train.sh
@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# Usage: bash run_multigpu_train.sh RANK_SIZE EPOCH_SIZE DATASET
+script_self=$(readlink -f "$0")
+self_path=$(dirname "${script_self}")
+RANK_SIZE=$1
+EPOCH_SIZE=$2
+DATASET=$3
+
+mpirun --allow-run-as-root -n $RANK_SIZE                    \
+    python -s ${self_path}/../train_and_eval_distribute.py  \
+        --device_target="GPU"                               \
+        --data_path=$DATASET                                \
+        --epochs=$EPOCH_SIZE > log.txt 2>&1 &
--- a/model_zoo/wide_and_deep/script/run_multinpu_train.sh
+++ b/model_zoo/wide_and_deep/script/run_multinpu_train.sh
@ -31,5 +31,5 @@ do
  cd ${execute_path}/device_$i/ || exit
  export RANK_ID=$i
  export DEVICE_ID=$i
-  python -s ${self_path}/../train_and_eval_multinpu.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 &
+  python -s ${self_path}/../train_and_eval_distribute.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 &
 done
--- a/model_zoo/wide_and_deep/src/config.py
+++ b/model_zoo/wide_and_deep/src/config.py
@ -20,6 +20,8 @@ def argparse_init():
    argparse_init
    """
    parser = argparse.ArgumentParser(description='WideDeep')
+    parser.add_argument("--device_target", type=str, default="Ascend", choices=["Ascend", "GPU"],
+                        help="device where the code will be implemented. (Default: Ascend)")
    parser.add_argument("--data_path", type=str, default="./test_raw_data/")
    parser.add_argument("--epochs", type=int, default=15)
    parser.add_argument("--full_batch", type=bool, default=False)
@ -44,6 +46,7 @@ class WideDeepConfig():
    WideDeepConfig
    """
    def __init__(self):
+        self.device_target = "Ascend"
        self.data_path = "./test_raw_data/"
        self.full_batch = False
        self.epochs = 15
@ -72,6 +75,7 @@ class WideDeepConfig():
        """
        parser = argparse_init()
        args, _ = parser.parse_known_args()
+        self.device_target = args.device_target
        self.data_path = args.data_path
        self.epochs = args.epochs
        self.full_batch = args.full_batch
--- a/model_zoo/wide_and_deep/train_and_eval_distribute.py
+++ b/model_zoo/wide_and_deep/train_and_eval_distribute.py
@ -30,10 +30,6 @@ from src.metrics import AUCMetric
 from src.config import WideDeepConfig

 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
-context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True)
-init()
-


 def get_WideDeep_net(config):
@ -105,4 +101,13 @@ def train_and_eval(config):
 if __name__ == "__main__":
    wide_deep_config = WideDeepConfig()
    wide_deep_config.argparse_init()
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
+    if wide_deep_config.device_target == "Ascend":
+        init("hccl")
+    elif wide_deep_config.device_target == "GPU":
+        init("nccl")
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
+                                      device_num=get_group_size())
+
    train_and_eval(wide_deep_config)