add distributed training for vgg16

caojian05 2020-04-24 18:46:09 +08:00
parent a02eb240e9
commit fd511d0729
3 changed files with 80 additions and 8 deletions

dataset.py

@@ -28,7 +28,11 @@ def create_dataset(data_home, repeat_num=1, training=True):
     data_dir = os.path.join(data_home, "cifar-10-batches-bin")
     if not training:
         data_dir = os.path.join(data_home, "cifar-10-verify-bin")
-    data_set = ds.Cifar10Dataset(data_dir)
+    rank_size = int(os.environ.get("RANK_SIZE")) if os.environ.get("RANK_SIZE") else None
+    rank_id = int(os.environ.get("RANK_ID")) if os.environ.get("RANK_ID") else None
+    data_set = ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id)
     resize_height = cfg.image_height
     resize_width = cfg.image_width
     rescale = 1.0 / 255.0
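
The pattern above reads RANK_SIZE and RANK_ID from the environment so the same create_dataset serves both single-device and distributed runs: when the variables are unset, both arguments fall back to None and the dataset is read unsharded. A minimal standalone sketch of the same pattern (the data_dir argument is a placeholder path):

import os
import mindspore.dataset as ds

def create_sharded_cifar10(data_dir):
    # RANK_SIZE / RANK_ID are exported per process by the launch script;
    # when absent (single-device run) both fall back to None.
    rank_size = int(os.environ["RANK_SIZE"]) if "RANK_SIZE" in os.environ else None
    rank_id = int(os.environ["RANK_ID"]) if "RANK_ID" in os.environ else None
    # num_shards/shard_id give each rank a disjoint 1/RANK_SIZE slice of CIFAR-10.
    return ds.Cifar10Dataset(data_dir, num_shards=rank_size, shard_id=rank_id)

The shard identity has to stay consistent with the communication rank, which is why the launch script below exports RANK_ID=$i alongside DEVICE_ID=$i.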

run_distribute_train.sh (new file)

@@ -0,0 +1,53 @@
#!/bin/bash
# Copyright 2020 Huawei Technologies Co., Ltd
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
# ============================================================================
if [ $# != 2 ]
then
    echo "Usage: sh run_distribute_train.sh [MINDSPORE_HCCL_CONFIG_PATH] [DATA_PATH]"
    exit 1
fi

if [ ! -f "$1" ]
then
    echo "error: MINDSPORE_HCCL_CONFIG_PATH=$1 is not a file"
    exit 1
fi

if [ ! -d "$2" ]
then
    echo "error: DATA_PATH=$2 is not a directory"
    exit 1
fi
ulimit -u unlimited
export DEVICE_NUM=8
export RANK_SIZE=8
export MINDSPORE_HCCL_CONFIG_PATH="$1"
for ((i = 0; i < ${DEVICE_NUM}; i++))
do
    export DEVICE_ID=$i
    export RANK_ID=$i
    rm -rf ./train_parallel$i
    mkdir ./train_parallel$i
    cp *.py ./train_parallel$i
    cp *.sh ./train_parallel$i
    cd ./train_parallel$i || exit
    echo "start training for rank $RANK_ID, device $DEVICE_ID"
    env > env.log
    python train.py --data_path="$2" --device_id=$i &> log &
    cd ..
done
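
A hypothetical invocation, assuming an 8-device HCCL rank table at /path/to/hccl_8p.json and the CIFAR-10 binaries under /path/to/cifar10 (both paths are placeholders):

sh run_distribute_train.sh /path/to/hccl_8p.json /path/to/cifar10

The script fans out one background train.py per device into its own train_parallel$i working directory, so each rank's env.log and log can be inspected independently.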

train.py

@@ -17,16 +17,18 @@
 python train.py --data_path=$DATA_HOME --device_id=$DEVICE_ID
 """
 import argparse
+import os
 import random
 import numpy as np
 import mindspore.nn as nn
 from mindspore import Tensor
+from mindspore.communication.management import init
 from mindspore.nn.optim.momentum import Momentum
-from mindspore.train.model import Model
+from mindspore.train.model import Model, ParallelMode
 from mindspore import context
-from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor
+from mindspore.train.callback import ModelCheckpoint, CheckpointConfig, LossMonitor, TimeMonitor
 from mindspore.model_zoo.vgg import vgg16
-import dataset
+from dataset import create_dataset
 from config import cifar_cfg as cfg
 random.seed(1)
 np.random.seed(1)
@@ -62,18 +64,31 @@ if __name__ == '__main__':
     context.set_context(mode=context.GRAPH_MODE, device_target=args_opt.device_target)
     context.set_context(device_id=args_opt.device_id)
     context.set_context(enable_task_sink=True)
     context.set_context(enable_loop_sink=True)
     context.set_context(enable_mem_reuse=True, enable_hccl=False)
+    device_num = int(os.environ.get("DEVICE_NUM", 1))
+    if device_num > 1:
+        context.reset_auto_parallel_context()
+        context.set_context(enable_hccl=True)
+        context.set_auto_parallel_context(device_num=device_num, parallel_mode=ParallelMode.DATA_PARALLEL,
+                                          mirror_mean=True)
+        init()
+    dataset = create_dataset(args_opt.data_path, cfg.epoch_size)
+    batch_num = dataset.get_dataset_size()
     net = vgg16(num_classes=cfg.num_classes)
-    lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=50000 // cfg.batch_size)
+    lr = lr_steps(0, lr_max=cfg.lr_init, total_epochs=cfg.epoch_size, steps_per_epoch=batch_num)
     opt = Momentum(filter(lambda x: x.requires_grad, net.get_parameters()), Tensor(lr), cfg.momentum, weight_decay=cfg.weight_decay)
     loss = nn.SoftmaxCrossEntropyWithLogits(sparse=True, reduction='mean', is_grad=False)
     model = Model(net, loss_fn=loss, optimizer=opt, metrics={'acc'},
                   amp_level="O2", keep_batchnorm_fp32=False, loss_scale_manager=None)
-    dataset = dataset.create_dataset(args_opt.data_path, cfg.epoch_size)
-    batch_num = dataset.get_dataset_size()
     config_ck = CheckpointConfig(save_checkpoint_steps=batch_num * 5, keep_checkpoint_max=cfg.keep_checkpoint_max)
+    time_cb = TimeMonitor(data_size=batch_num)
     ckpoint_cb = ModelCheckpoint(prefix="train_vgg_cifar10", directory="./", config=config_ck)
     loss_cb = LossMonitor()
-    model.train(cfg.epoch_size, dataset, callbacks=[ckpoint_cb, loss_cb])
+    model.train(cfg.epoch_size, dataset, callbacks=[time_cb, ckpoint_cb, loss_cb])
     print("train success")
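
lr_steps itself is defined earlier in train.py and sits outside this diff; the change only swaps the hard-coded 50000 // cfg.batch_size (the CIFAR-10 training-set size) for the actual dataset size, which shrinks to a per-shard step count under distributed sharding. Purely as an illustration of a step-indexed schedule with that signature (the decay points and factors below are assumptions, not the repository's implementation):

import numpy as np

def lr_steps(global_step, lr_max, total_epochs, steps_per_epoch):
    # Hypothetical sketch: hold lr_max, then decay 10x at 50% and 75%
    # of training. The real lr_steps in train.py may differ.
    total_steps = total_epochs * steps_per_epoch
    lr_each_step = []
    for step in range(total_steps):
        if step < 0.5 * total_steps:
            lr_each_step.append(lr_max)
        elif step < 0.75 * total_steps:
            lr_each_step.append(lr_max * 0.1)
        else:
            lr_each_step.append(lr_max * 0.01)
    # Return the schedule from the current step onward, as a float32 array
    # suitable for wrapping in a Tensor for the Momentum optimizer.
    return np.array(lr_each_step[global_step:], dtype=np.float32)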