From 71ffd22a0250ca57257c4597f7910d467c0433a4 Mon Sep 17 00:00:00 2001 From: lizhenyu Date: Wed, 24 Jun 2020 11:49:56 +0800 Subject: [PATCH] add wide&deep standalone training script for gpu in model zoo --- model_zoo/wide_and_deep/README.md | 3 +++ model_zoo/wide_and_deep/eval.py | 7 ++--- .../script/run_multigpu_train.sh | 3 ++- .../script/run_standalone_train_for_gpu.sh | 27 +++++++++++++++++++ model_zoo/wide_and_deep/train.py | 9 ++++--- model_zoo/wide_and_deep/train_and_eval.py | 11 +++++--- .../train_and_eval_auto_parallel.py | 3 +++ .../train_and_eval_distribute.py | 11 ++++++-- 8 files changed, 60 insertions(+), 14 deletions(-) create mode 100644 model_zoo/wide_and_deep/script/run_standalone_train_for_gpu.sh diff --git a/model_zoo/wide_and_deep/README.md b/model_zoo/wide_and_deep/README.md index 54367ef173e..000e6a53353 100644 --- a/model_zoo/wide_and_deep/README.md +++ b/model_zoo/wide_and_deep/README.md @@ -37,6 +37,7 @@ To train and evaluate the model, command as follows: python train_and_eval.py ``` Arguments: + * `--device_target`: Device where the code will be run (Default: Ascend). * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. * `--epochs`: Total train epochs. * `--batch_size`: Training batch size. @@ -57,6 +58,7 @@ To train the model in one device, command as follows: python train.py ``` Arguments: + * `--device_target`: Device where the code will be run (Default: Ascend). * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. * `--epochs`: Total train epochs. * `--batch_size`: Training batch size. @@ -87,6 +89,7 @@ To evaluate the model, command as follows: python eval.py ``` Arguments: + * `--device_target`: Device where the code will be run (Default: Ascend). * `--data_path`: This should be set to the same directory given to the data_download's data_dir argument. * `--epochs`: Total train epochs. 
* `--batch_size`: Training batch size. diff --git a/model_zoo/wide_and_deep/eval.py b/model_zoo/wide_and_deep/eval.py index 72c30b1f5c4..bc3846533f2 100644 --- a/model_zoo/wide_and_deep/eval.py +++ b/model_zoo/wide_and_deep/eval.py @@ -26,11 +26,11 @@ from src.datasets import create_dataset from src.metrics import AUCMetric from src.config import WideDeepConfig -context.set_context(mode=context.GRAPH_MODE, device_target="Davinci", - save_graphs=True) - def get_WideDeep_net(config): + """ + Get network of wide&deep model. + """ WideDeep_net = WideDeepModel(config) loss_net = NetWithLossClass(WideDeep_net, config) @@ -91,4 +91,5 @@ if __name__ == "__main__": widedeep_config = WideDeepConfig() widedeep_config.argparse_init() + context.set_context(mode=context.GRAPH_MODE, device_target=widedeep_config.device_target) test_eval(widedeep_config) diff --git a/model_zoo/wide_and_deep/script/run_multigpu_train.sh b/model_zoo/wide_and_deep/script/run_multigpu_train.sh index 987eeaa65e5..e0e08ab80ed 100644 --- a/model_zoo/wide_and_deep/script/run_multigpu_train.sh +++ b/model_zoo/wide_and_deep/script/run_multigpu_train.sh @@ -14,7 +14,7 @@ # limitations under the License. 
# ============================================================================ -# bash run_multigpu_train.sh +# bash run_multigpu_train.sh RANK_SIZE EPOCH_SIZE DATASET script_self=$(readlink -f "$0") self_path=$(dirname "${script_self}") RANK_SIZE=$1 @@ -25,4 +25,5 @@ mpirun --allow-run-as-root -n $RANK_SIZE \ python -s ${self_path}/../train_and_eval_distribute.py \ --device_target="GPU" \ --data_path=$DATASET \ + --batch_size=8000 \ --epochs=$EPOCH_SIZE > log.txt 2>&1 & diff --git a/model_zoo/wide_and_deep/script/run_standalone_train_for_gpu.sh b/model_zoo/wide_and_deep/script/run_standalone_train_for_gpu.sh new file mode 100644 index 00000000000..693c62b8478 --- /dev/null +++ b/model_zoo/wide_and_deep/script/run_standalone_train_for_gpu.sh @@ -0,0 +1,27 @@ +#!/bin/bash +# Copyright 2020 Huawei Technologies Co., Ltd +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. 
+# ============================================================================ + +# bash run_standalone_train_for_gpu.sh EPOCH_SIZE DATASET +script_self=$(readlink -f "$0") +self_path=$(dirname "${script_self}") +EPOCH_SIZE=$1 +DATASET=$2 + +python -s ${self_path}/../train_and_eval.py \ + --device_target="GPU" \ + --data_path=$DATASET \ + --batch_size=16000 \ + --epochs=$EPOCH_SIZE > log.txt 2>&1 & diff --git a/model_zoo/wide_and_deep/train.py b/model_zoo/wide_and_deep/train.py index ac9750c547c..a043be3dc6d 100644 --- a/model_zoo/wide_and_deep/train.py +++ b/model_zoo/wide_and_deep/train.py @@ -15,16 +15,16 @@ import os from mindspore import Model, context from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor - from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel from src.callbacks import LossCallBack from src.datasets import create_dataset from src.config import WideDeepConfig -context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True) - def get_WideDeep_net(configure): + """ + Get network of wide&deep model. 
+ """ WideDeep_net = WideDeepModel(configure) loss_net = NetWithLossClass(WideDeep_net, configure) @@ -72,7 +72,7 @@ def test_train(configure): model = Model(train_net) callback = LossCallBack(config=configure) - ckptconfig = CheckpointConfig(save_checkpoint_steps=1, + ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig) model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb]) @@ -82,4 +82,5 @@ if __name__ == "__main__": config = WideDeepConfig() config.argparse_init() + context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target) test_train(config) diff --git a/model_zoo/wide_and_deep/train_and_eval.py b/model_zoo/wide_and_deep/train_and_eval.py index 0b37b67a11c..e0ab6b2e9e7 100644 --- a/model_zoo/wide_and_deep/train_and_eval.py +++ b/model_zoo/wide_and_deep/train_and_eval.py @@ -15,7 +15,7 @@ import os from mindspore import Model, context -from mindspore.train.callback import ModelCheckpoint, CheckpointConfig +from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel from src.callbacks import LossCallBack, EvalCallBack @@ -23,10 +23,11 @@ from src.datasets import create_dataset from src.metrics import AUCMetric from src.config import WideDeepConfig -context.set_context(mode=context.GRAPH_MODE, device_target="Davinci") - def get_WideDeep_net(config): + """ + Get network of wide&deep model. 
+ """ WideDeep_net = WideDeepModel(config) loss_net = NetWithLossClass(WideDeep_net, config) @@ -87,11 +88,13 @@ def test_train_eval(config): out = model.eval(ds_eval) print("=====" * 5 + "model.eval() initialized: {}".format(out)) - model.train(epochs, ds_train, callbacks=[eval_callback, callback, ckpoint_cb]) + model.train(epochs, ds_train, + callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb]) if __name__ == "__main__": wide_deep_config = WideDeepConfig() wide_deep_config.argparse_init() + context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target) test_train_eval(wide_deep_config) diff --git a/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py b/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py index 780c95540c0..4c86931b2e7 100644 --- a/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py +++ b/model_zoo/wide_and_deep/train_and_eval_auto_parallel.py @@ -40,6 +40,9 @@ init() def get_WideDeep_net(config): + """ + Get network of wide&deep model. + """ WideDeep_net = WideDeepModel(config) loss_net = NetWithLossClass(WideDeep_net, config) loss_net = VirtualDatasetCellTriple(loss_net) diff --git a/model_zoo/wide_and_deep/train_and_eval_distribute.py b/model_zoo/wide_and_deep/train_and_eval_distribute.py index db98bacfecb..71f2b11cba5 100644 --- a/model_zoo/wide_and_deep/train_and_eval_distribute.py +++ b/model_zoo/wide_and_deep/train_and_eval_distribute.py @@ -33,6 +33,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__)))) def get_WideDeep_net(config): + """ + Get network of wide&deep model. 
+ """ WideDeep_net = WideDeepModel(config) loss_net = NetWithLossClass(WideDeep_net, config) train_net = TrainStepWrap(loss_net) @@ -90,8 +93,12 @@ def train_and_eval(config): callback = LossCallBack(config=config) ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5) - ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', - directory=config.ckpt_path, config=ckptconfig) + if config.device_target == "Ascend": + ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', + directory=config.ckpt_path, config=ckptconfig) + elif config.device_target == "GPU": + ckpoint_cb = ModelCheckpoint(prefix='widedeep_train_' + str(get_rank()), + directory=config.ckpt_path, config=ckptconfig) out = model.eval(ds_eval) print("=====" * 5 + "model.eval() initialized: {}".format(out)) model.train(epochs, ds_train,