From 28492e50aeca96d1327e84a8c82d9eace44e12fa Mon Sep 17 00:00:00 2001
From: lizhenyu
Date: Thu, 18 Jun 2020 14:24:04 +0800
Subject: [PATCH] add model zoo script of wide and deep for gpu

---
Reviewer notes (this area is discarded by `git am`; not part of the commit):
 * The line structure of this patch was restored from a whitespace-collapsed
   copy. Indentation and blank context lines are reconstructed from the hunk
   counts and should be confirmed against the target tree, e.g. with
   `git apply --check` (or `--ignore-whitespace` if only indentation differs).
 * run_multigpu_train.sh: parameter expansions are now quoted so paths with
   spaces are not word-split, and the usage comment names the three positional
   arguments. The `index` blob id of that file predates these edits.

 .../script/run_multigpu_train.sh              | 28 +++++++++++++++++++
 .../script/run_multinpu_train.sh              |  2 +-
 model_zoo/wide_and_deep/src/config.py         |  4 +++
 ...ltinpu.py => train_and_eval_distribute.py} | 13 ++++++---
 4 files changed, 42 insertions(+), 5 deletions(-)
 create mode 100644 model_zoo/wide_and_deep/script/run_multigpu_train.sh
 rename model_zoo/wide_and_deep/{train_and_eval_multinpu.py => train_and_eval_distribute.py} (89%)

diff --git a/model_zoo/wide_and_deep/script/run_multigpu_train.sh b/model_zoo/wide_and_deep/script/run_multigpu_train.sh
new file mode 100644
index 0000000000..987eeaa65e
--- /dev/null
+++ b/model_zoo/wide_and_deep/script/run_multigpu_train.sh
@@ -0,0 +1,28 @@
+#!/bin/bash
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+# bash run_multigpu_train.sh RANK_SIZE EPOCH_SIZE DATASET
+script_self=$(readlink -f "$0")
+self_path=$(dirname "${script_self}")
+RANK_SIZE=$1
+EPOCH_SIZE=$2
+DATASET=$3
+
+mpirun --allow-run-as-root -n "$RANK_SIZE" \
+    python -s "${self_path}/../train_and_eval_distribute.py" \
+        --device_target="GPU" \
+        --data_path="$DATASET" \
+        --epochs="$EPOCH_SIZE" > log.txt 2>&1 &
diff --git a/model_zoo/wide_and_deep/script/run_multinpu_train.sh b/model_zoo/wide_and_deep/script/run_multinpu_train.sh
index c05156ff7e..4b642bc196 100644
--- a/model_zoo/wide_and_deep/script/run_multinpu_train.sh
+++ b/model_zoo/wide_and_deep/script/run_multinpu_train.sh
@@ -31,5 +31,5 @@ do
     cd ${execute_path}/device_$i/ || exit
     export RANK_ID=$i
     export DEVICE_ID=$i
-    python -s ${self_path}/../train_and_eval_multinpu.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 &
+    python -s ${self_path}/../train_and_eval_distribute.py --data_path=$DATASET --epochs=$EPOCH_SIZE >train_deep$i.log 2>&1 &
 done
diff --git a/model_zoo/wide_and_deep/src/config.py b/model_zoo/wide_and_deep/src/config.py
index f3488287af..c822b8e76d 100644
--- a/model_zoo/wide_and_deep/src/config.py
+++ b/model_zoo/wide_and_deep/src/config.py
@@ -20,6 +20,8 @@ def argparse_init():
     argparse_init
     """
     parser = argparse.ArgumentParser(description='WideDeep')
+    parser.add_argument("--device_target", type=str, default="Ascend", choices=["Ascend", "GPU"],
+                        help="device where the code will be implemented. (Default: Ascend)")
     parser.add_argument("--data_path", type=str, default="./test_raw_data/")
     parser.add_argument("--epochs", type=int, default=15)
     parser.add_argument("--full_batch", type=bool, default=False)
@@ -44,6 +46,7 @@ class WideDeepConfig():
     WideDeepConfig
     """
     def __init__(self):
+        self.device_target = "Ascend"
         self.data_path = "./test_raw_data/"
         self.full_batch = False
         self.epochs = 15
@@ -72,6 +75,7 @@ class WideDeepConfig():
         """
         parser = argparse_init()
         args, _ = parser.parse_known_args()
+        self.device_target = args.device_target
         self.data_path = args.data_path
         self.epochs = args.epochs
         self.full_batch = args.full_batch
diff --git a/model_zoo/wide_and_deep/train_and_eval_multinpu.py b/model_zoo/wide_and_deep/train_and_eval_distribute.py
similarity index 89%
rename from model_zoo/wide_and_deep/train_and_eval_multinpu.py
rename to model_zoo/wide_and_deep/train_and_eval_distribute.py
index 37ade572bf..db98bacfec 100644
--- a/model_zoo/wide_and_deep/train_and_eval_multinpu.py
+++ b/model_zoo/wide_and_deep/train_and_eval_distribute.py
@@ -30,10 +30,6 @@ from src.metrics import AUCMetric
 from src.config import WideDeepConfig
 
 sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
-context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
-context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True)
-init()
-
 
 
 def get_WideDeep_net(config):
@@ -105,4 +101,13 @@ def train_and_eval(config):
 if __name__ == "__main__":
     wide_deep_config = WideDeepConfig()
     wide_deep_config.argparse_init()
+
+    context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target, save_graphs=True)
+    if wide_deep_config.device_target == "Ascend":
+        init("hccl")
+    elif wide_deep_config.device_target == "GPU":
+        init("nccl")
+    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL, mirror_mean=True,
+                                      device_num=get_group_size())
+
     train_and_eval(wide_deep_config)