forked from mindspore-Ecosystem/mindspore
!2549 add wide&deep standalone training script for gpu in model zoo
Merge pull request !2549 from zyli2020/add_model_script_of_wide_deep_for_gpu
This commit is contained in:
commit
cf0eca5608
|
@ -37,6 +37,7 @@ To train and evaluate the model, command as follows:
|
||||||
python train_and_eval.py
|
python train_and_eval.py
|
||||||
```
|
```
|
||||||
Arguments:
|
Arguments:
|
||||||
|
* `--device_target`: Target device on which the code will run (Default: Ascend).
|
||||||
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
|
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
|
||||||
* `--epochs`: Total train epochs.
|
* `--epochs`: Total train epochs.
|
||||||
* `--batch_size`: Training batch size.
|
* `--batch_size`: Training batch size.
|
||||||
|
@ -57,6 +58,7 @@ To train the model in one device, command as follows:
|
||||||
python train.py
|
python train.py
|
||||||
```
|
```
|
||||||
Arguments:
|
Arguments:
|
||||||
|
* `--device_target`: Target device on which the code will run (Default: Ascend).
|
||||||
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
|
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
|
||||||
* `--epochs`: Total train epochs.
|
* `--epochs`: Total train epochs.
|
||||||
* `--batch_size`: Training batch size.
|
* `--batch_size`: Training batch size.
|
||||||
|
@ -87,6 +89,7 @@ To evaluate the model, command as follows:
|
||||||
python eval.py
|
python eval.py
|
||||||
```
|
```
|
||||||
Arguments:
|
Arguments:
|
||||||
|
* `--device_target`: Target device on which the code will run (Default: Ascend).
|
||||||
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
|
* `--data_path`: This should be set to the same directory given to the data_download's data_dir argument.
|
||||||
* `--epochs`: Total train epochs.
|
* `--epochs`: Total train epochs.
|
||||||
* `--batch_size`: Training batch size.
|
* `--batch_size`: Training batch size.
|
||||||
|
|
|
@ -26,11 +26,11 @@ from src.datasets import create_dataset
|
||||||
from src.metrics import AUCMetric
|
from src.metrics import AUCMetric
|
||||||
from src.config import WideDeepConfig
|
from src.config import WideDeepConfig
|
||||||
|
|
||||||
context.set_context(mode=context.GRAPH_MODE, device_target="Davinci",
|
|
||||||
save_graphs=True)
|
|
||||||
|
|
||||||
|
|
||||||
def get_WideDeep_net(config):
|
def get_WideDeep_net(config):
|
||||||
|
"""
|
||||||
|
Get network of wide&deep model.
|
||||||
|
"""
|
||||||
WideDeep_net = WideDeepModel(config)
|
WideDeep_net = WideDeepModel(config)
|
||||||
|
|
||||||
loss_net = NetWithLossClass(WideDeep_net, config)
|
loss_net = NetWithLossClass(WideDeep_net, config)
|
||||||
|
@ -91,4 +91,5 @@ if __name__ == "__main__":
|
||||||
widedeep_config = WideDeepConfig()
|
widedeep_config = WideDeepConfig()
|
||||||
widedeep_config.argparse_init()
|
widedeep_config.argparse_init()
|
||||||
|
|
||||||
|
context.set_context(mode=context.GRAPH_MODE, device_target=widedeep_config.device_target)
|
||||||
test_eval(widedeep_config)
|
test_eval(widedeep_config)
|
||||||
|
|
|
@ -14,7 +14,7 @@
|
||||||
# limitations under the License.
|
# limitations under the License.
|
||||||
# ============================================================================
|
# ============================================================================
|
||||||
|
|
||||||
# bash run_multigpu_train.sh
|
# bash run_multigpu_train.sh RANK_SIZE EPOCH_SIZE DATASET
|
||||||
script_self=$(readlink -f "$0")
|
script_self=$(readlink -f "$0")
|
||||||
self_path=$(dirname "${script_self}")
|
self_path=$(dirname "${script_self}")
|
||||||
RANK_SIZE=$1
|
RANK_SIZE=$1
|
||||||
|
@ -25,4 +25,5 @@ mpirun --allow-run-as-root -n $RANK_SIZE \
|
||||||
python -s ${self_path}/../train_and_eval_distribute.py \
|
python -s ${self_path}/../train_and_eval_distribute.py \
|
||||||
--device_target="GPU" \
|
--device_target="GPU" \
|
||||||
--data_path=$DATASET \
|
--data_path=$DATASET \
|
||||||
|
--batch_size=8000 \
|
||||||
--epochs=$EPOCH_SIZE > log.txt 2>&1 &
|
--epochs=$EPOCH_SIZE > log.txt 2>&1 &
|
||||||
|
|
|
@ -0,0 +1,27 @@
|
||||||
|
#!/bin/bash
|
||||||
|
# Copyright 2020 Huawei Technologies Co., Ltd
|
||||||
|
#
|
||||||
|
# Licensed under the Apache License, Version 2.0 (the "License");
|
||||||
|
# you may not use this file except in compliance with the License.
|
||||||
|
# You may obtain a copy of the License at
|
||||||
|
#
|
||||||
|
# http://www.apache.org/licenses/LICENSE-2.0
|
||||||
|
#
|
||||||
|
# Unless required by applicable law or agreed to in writing, software
|
||||||
|
# distributed under the License is distributed on an "AS IS" BASIS,
|
||||||
|
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
|
||||||
|
# See the License for the specific language governing permissions and
|
||||||
|
# limitations under the License.
|
||||||
|
# ============================================================================
|
||||||
|
|
||||||
|
# bash run_standalone_train_for_gpu.sh EPOCH_SIZE DATASET
|
||||||
|
script_self=$(readlink -f "$0")
|
||||||
|
self_path=$(dirname "${script_self}")
|
||||||
|
EPOCH_SIZE=$1
|
||||||
|
DATASET=$2
|
||||||
|
|
||||||
|
python -s ${self_path}/../train_and_eval.py \
|
||||||
|
--device_target="GPU" \
|
||||||
|
--data_path=$DATASET \
|
||||||
|
--batch_size=16000 \
|
||||||
|
--epochs=$EPOCH_SIZE > log.txt 2>&1 &
|
|
@ -15,16 +15,16 @@
|
||||||
import os
|
import os
|
||||||
from mindspore import Model, context
|
from mindspore import Model, context
|
||||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
|
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
|
||||||
|
|
||||||
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
|
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
|
||||||
from src.callbacks import LossCallBack
|
from src.callbacks import LossCallBack
|
||||||
from src.datasets import create_dataset
|
from src.datasets import create_dataset
|
||||||
from src.config import WideDeepConfig
|
from src.config import WideDeepConfig
|
||||||
|
|
||||||
context.set_context(mode=context.GRAPH_MODE, device_target="Ascend", save_graphs=True)
|
|
||||||
|
|
||||||
|
|
||||||
def get_WideDeep_net(configure):
|
def get_WideDeep_net(configure):
|
||||||
|
"""
|
||||||
|
Get network of wide&deep model.
|
||||||
|
"""
|
||||||
WideDeep_net = WideDeepModel(configure)
|
WideDeep_net = WideDeepModel(configure)
|
||||||
|
|
||||||
loss_net = NetWithLossClass(WideDeep_net, configure)
|
loss_net = NetWithLossClass(WideDeep_net, configure)
|
||||||
|
@ -72,7 +72,7 @@ def test_train(configure):
|
||||||
|
|
||||||
model = Model(train_net)
|
model = Model(train_net)
|
||||||
callback = LossCallBack(config=configure)
|
callback = LossCallBack(config=configure)
|
||||||
ckptconfig = CheckpointConfig(save_checkpoint_steps=1,
|
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(),
|
||||||
keep_checkpoint_max=5)
|
keep_checkpoint_max=5)
|
||||||
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig)
|
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train', directory=configure.ckpt_path, config=ckptconfig)
|
||||||
model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb])
|
model.train(epochs, ds_train, callbacks=[TimeMonitor(ds_train.get_dataset_size()), callback, ckpoint_cb])
|
||||||
|
@ -82,4 +82,5 @@ if __name__ == "__main__":
|
||||||
config = WideDeepConfig()
|
config = WideDeepConfig()
|
||||||
config.argparse_init()
|
config.argparse_init()
|
||||||
|
|
||||||
|
context.set_context(mode=context.GRAPH_MODE, device_target=config.device_target)
|
||||||
test_train(config)
|
test_train(config)
|
||||||
|
|
|
@ -15,7 +15,7 @@
|
||||||
import os
|
import os
|
||||||
|
|
||||||
from mindspore import Model, context
|
from mindspore import Model, context
|
||||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
|
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, TimeMonitor
|
||||||
|
|
||||||
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
|
from src.wide_and_deep import PredictWithSigmoid, TrainStepWrap, NetWithLossClass, WideDeepModel
|
||||||
from src.callbacks import LossCallBack, EvalCallBack
|
from src.callbacks import LossCallBack, EvalCallBack
|
||||||
|
@ -23,10 +23,11 @@ from src.datasets import create_dataset
|
||||||
from src.metrics import AUCMetric
|
from src.metrics import AUCMetric
|
||||||
from src.config import WideDeepConfig
|
from src.config import WideDeepConfig
|
||||||
|
|
||||||
context.set_context(mode=context.GRAPH_MODE, device_target="Davinci")
|
|
||||||
|
|
||||||
|
|
||||||
def get_WideDeep_net(config):
|
def get_WideDeep_net(config):
|
||||||
|
"""
|
||||||
|
Get network of wide&deep model.
|
||||||
|
"""
|
||||||
WideDeep_net = WideDeepModel(config)
|
WideDeep_net = WideDeepModel(config)
|
||||||
|
|
||||||
loss_net = NetWithLossClass(WideDeep_net, config)
|
loss_net = NetWithLossClass(WideDeep_net, config)
|
||||||
|
@ -87,11 +88,13 @@ def test_train_eval(config):
|
||||||
|
|
||||||
out = model.eval(ds_eval)
|
out = model.eval(ds_eval)
|
||||||
print("=====" * 5 + "model.eval() initialized: {}".format(out))
|
print("=====" * 5 + "model.eval() initialized: {}".format(out))
|
||||||
model.train(epochs, ds_train, callbacks=[eval_callback, callback, ckpoint_cb])
|
model.train(epochs, ds_train,
|
||||||
|
callbacks=[TimeMonitor(ds_train.get_dataset_size()), eval_callback, callback, ckpoint_cb])
|
||||||
|
|
||||||
|
|
||||||
if __name__ == "__main__":
|
if __name__ == "__main__":
|
||||||
wide_deep_config = WideDeepConfig()
|
wide_deep_config = WideDeepConfig()
|
||||||
wide_deep_config.argparse_init()
|
wide_deep_config.argparse_init()
|
||||||
|
|
||||||
|
context.set_context(mode=context.GRAPH_MODE, device_target=wide_deep_config.device_target)
|
||||||
test_train_eval(wide_deep_config)
|
test_train_eval(wide_deep_config)
|
||||||
|
|
|
@ -40,6 +40,9 @@ init()
|
||||||
|
|
||||||
|
|
||||||
def get_WideDeep_net(config):
|
def get_WideDeep_net(config):
|
||||||
|
"""
|
||||||
|
Get network of wide&deep model.
|
||||||
|
"""
|
||||||
WideDeep_net = WideDeepModel(config)
|
WideDeep_net = WideDeepModel(config)
|
||||||
loss_net = NetWithLossClass(WideDeep_net, config)
|
loss_net = NetWithLossClass(WideDeep_net, config)
|
||||||
loss_net = VirtualDatasetCellTriple(loss_net)
|
loss_net = VirtualDatasetCellTriple(loss_net)
|
||||||
|
|
|
@ -33,6 +33,9 @@ sys.path.append(os.path.dirname(os.path.dirname(os.path.abspath(__file__))))
|
||||||
|
|
||||||
|
|
||||||
def get_WideDeep_net(config):
|
def get_WideDeep_net(config):
|
||||||
|
"""
|
||||||
|
Get network of wide&deep model.
|
||||||
|
"""
|
||||||
WideDeep_net = WideDeepModel(config)
|
WideDeep_net = WideDeepModel(config)
|
||||||
loss_net = NetWithLossClass(WideDeep_net, config)
|
loss_net = NetWithLossClass(WideDeep_net, config)
|
||||||
train_net = TrainStepWrap(loss_net)
|
train_net = TrainStepWrap(loss_net)
|
||||||
|
@ -90,8 +93,12 @@ def train_and_eval(config):
|
||||||
|
|
||||||
callback = LossCallBack(config=config)
|
callback = LossCallBack(config=config)
|
||||||
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
|
ckptconfig = CheckpointConfig(save_checkpoint_steps=ds_train.get_dataset_size(), keep_checkpoint_max=5)
|
||||||
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
|
if config.device_target == "Ascend":
|
||||||
directory=config.ckpt_path, config=ckptconfig)
|
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train',
|
||||||
|
directory=config.ckpt_path, config=ckptconfig)
|
||||||
|
elif config.device_target == "GPU":
|
||||||
|
ckpoint_cb = ModelCheckpoint(prefix='widedeep_train_' + str(get_rank()),
|
||||||
|
directory=config.ckpt_path, config=ckptconfig)
|
||||||
out = model.eval(ds_eval)
|
out = model.eval(ds_eval)
|
||||||
print("=====" * 5 + "model.eval() initialized: {}".format(out))
|
print("=====" * 5 + "model.eval() initialized: {}".format(out))
|
||||||
model.train(epochs, ds_train,
|
model.train(epochs, ds_train,
|
||||||
|
|
Loading…
Reference in New Issue