forked from mindspore-Ecosystem/mindspore
!23005 fix mobilenetv2 bugs on GPU
Merge pull request !23005 from zhaoting/master
This commit is contained in:
commit
a4ee2145a6
|
@ -38,6 +38,7 @@ device_id: 0
|
|||
rank_id: 0
|
||||
rank_size: 1
|
||||
run_distribute: False
|
||||
run_eval: False
|
||||
activation: "Softmax"
|
||||
|
||||
# Image classification trian. train_parse_args():return train_args
|
||||
|
@ -86,6 +87,7 @@ file_name: "output file name."
|
|||
result_path: "result files path."
|
||||
label_path: "label path."
|
||||
enable_profiling: 'Whether enable profiling while training, default: False'
|
||||
run_eval: 'Whether run evaluation while training, default is false.'
|
||||
run_distribute: 'Run distribute, default is false.'
|
||||
device_id: 'Device id, default is 0.'
|
||||
rank_id: 'Rank id, default is 0.'
|
||||
|
|
|
@ -38,6 +38,7 @@ device_id: 0
|
|||
rank_id: 0
|
||||
rank_size: 1
|
||||
run_distribute: False
|
||||
run_eval: True
|
||||
activation: "Softmax"
|
||||
|
||||
# Image classification trian. train_parse_args():return train_args
|
||||
|
@ -86,6 +87,7 @@ file_name: "output file name."
|
|||
result_path: "result files path."
|
||||
label_path: "label path."
|
||||
enable_profiling: 'Whether enable profiling while training, default: False'
|
||||
run_eval: 'Whether run evaluation while training, default is false.'
|
||||
run_distribute: 'Run distribute, default is false.'
|
||||
device_id: 'Device id, default is 0.'
|
||||
rank_id: 'Rank id, default is 0.'
|
||||
|
|
|
@ -34,7 +34,11 @@ save_checkpoint_epochs: 1
|
|||
keep_checkpoint_max: 20
|
||||
save_checkpoint_path: "./"
|
||||
platform: 'CPU'
|
||||
device_id: 0
|
||||
rank_id: 0
|
||||
rank_size: 1
|
||||
run_distribute: False
|
||||
run_eval: False
|
||||
activation: "Softmax"
|
||||
|
||||
# Image classification trian. train_parse_args():return train_args
|
||||
|
@ -83,6 +87,7 @@ file_name: "output file name."
|
|||
result_path: "result files path."
|
||||
label_path: "label path."
|
||||
enable_profiling: 'Whether enable profiling while training, default: False'
|
||||
run_eval: 'Whether run evaluation while training, default is false.'
|
||||
run_distribute: 'Run distribute, default is false.'
|
||||
device_id: 'Device id, default is 0.'
|
||||
rank_id: 'Rank id, default is 0.'
|
||||
|
|
|
@ -34,6 +34,9 @@ save_checkpoint_epochs: 1
|
|||
keep_checkpoint_max: 200
|
||||
save_checkpoint_path: "./"
|
||||
platform: 'GPU'
|
||||
device_id: 0
|
||||
rank_id: 0
|
||||
rank_size: 1
|
||||
run_distribute: True
|
||||
activation: "Softmax"
|
||||
|
||||
|
@ -57,6 +60,7 @@ ckpt_file: "/cache/train/mobilenetv2-200_625.ckpt"
|
|||
file_name: "mobilenetv2"
|
||||
file_format: "MINDIR"
|
||||
is_training_export: False
|
||||
run_eval: False
|
||||
run_distribute_export: False
|
||||
|
||||
# postprocess.py / mobilenetv2 acc calculation
|
||||
|
@ -83,6 +87,7 @@ file_name: "output file name."
|
|||
result_path: "result files path."
|
||||
label_path: "label path."
|
||||
enable_profiling: 'Whether enable profiling while training, default: False'
|
||||
run_eval: 'Whether run evaluation while training, default is false.'
|
||||
run_distribute: 'Run distribute, default is false.'
|
||||
device_id: 'Device id, default is 0.'
|
||||
rank_id: 'Rank id, default is 0.'
|
||||
|
|
|
@ -19,21 +19,14 @@ import time
|
|||
import os
|
||||
from mindspore import nn
|
||||
from mindspore.train.model import Model
|
||||
from mindspore.common import dtype as mstype
|
||||
|
||||
from src.dataset import create_dataset
|
||||
from src.models import define_net, load_ckpt
|
||||
from src.utils import switch_precision, set_context
|
||||
from src.utils import context_device_init
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
|
||||
|
||||
from src.model_utils.device_adapter import get_device_id, get_device_num
|
||||
|
||||
config.is_training = config.is_training_eval
|
||||
config.device_id = get_device_id()
|
||||
config.rank_id = get_rank_id()
|
||||
config.rank_size = get_device_num()
|
||||
config.run_distribute = config.rank_size > 1.
|
||||
|
||||
def modelarts_process():
|
||||
""" modelarts process """
|
||||
|
@ -96,13 +89,13 @@ def modelarts_process():
|
|||
def eval_mobilenetv2():
|
||||
config.dataset_path = os.path.join(config.dataset_path, 'validation_preprocess')
|
||||
print('\nconfig: \n', config)
|
||||
set_context(config)
|
||||
if not config.device_id:
|
||||
config.device_id = get_device_id()
|
||||
context_device_init(config)
|
||||
_, _, net = define_net(config, config.is_training)
|
||||
|
||||
load_ckpt(net, config.pretrain_ckpt)
|
||||
|
||||
switch_precision(net, mstype.float16, config)
|
||||
|
||||
dataset = create_dataset(dataset_path=config.dataset_path, do_train=False, config=config)
|
||||
step_size = dataset.get_dataset_size()
|
||||
if step_size == 0:
|
||||
|
|
|
@ -16,25 +16,16 @@
|
|||
mobilenetv2 export file.
|
||||
"""
|
||||
import numpy as np
|
||||
from mindspore import Tensor, export, context
|
||||
from mindspore import Tensor, export
|
||||
from src.models import define_net, load_ckpt
|
||||
from src.utils import set_context
|
||||
from src.utils import context_device_init
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
|
||||
from src.model_utils.device_adapter import get_device_id
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
|
||||
|
||||
config.device_id = get_device_id()
|
||||
config.rank_id = get_rank_id()
|
||||
config.rank_size = get_device_num()
|
||||
config.run_distribute = config.rank_size > 1.
|
||||
config.batch_size = config.batch_size_export
|
||||
config.is_training = config.is_training_export
|
||||
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.platform)
|
||||
if config.platform == "Ascend":
|
||||
context.set_context(device_id=get_device_id())
|
||||
|
||||
def modelarts_process():
|
||||
pass
|
||||
|
||||
|
@ -42,7 +33,9 @@ def modelarts_process():
|
|||
def export_mobilenetv2():
|
||||
""" export_mobilenetv2 """
|
||||
print('\nconfig: \n', config)
|
||||
set_context(config)
|
||||
if not config.device_id:
|
||||
config.device_id = get_device_id()
|
||||
context_device_init(config)
|
||||
_, _, net = define_net(config, config.is_training)
|
||||
|
||||
load_ckpt(net, config.ckpt_file)
|
||||
|
|
|
@ -46,7 +46,10 @@ run_ascend()
|
|||
echo "error: DATASET_PATH=$6 is not a directory or file"
|
||||
exit 1
|
||||
fi
|
||||
|
||||
RUN_DISTRIBUTE=True
|
||||
if [ $2 -eq 1 ] ; then
|
||||
RUN_DISTRIBUTE=False
|
||||
fi
|
||||
BASEPATH=$(cd "`dirname $0`" || exit; pwd)
|
||||
CONFIG_FILE="${BASEPATH}/../$2"
|
||||
|
||||
|
@ -85,6 +88,7 @@ run_ascend()
|
|||
echo "start training for rank $RANK_ID, device $DEVICE_ID"
|
||||
env > env.log
|
||||
taskset -c $cmdopt python train.py \
|
||||
--run_distribute=$RUN_DISTRIBUTE \
|
||||
--config_path=$CONFIG_FILE \
|
||||
--platform=$1 \
|
||||
--dataset_path=$6 \
|
||||
|
|
|
@ -49,31 +49,8 @@ def create_dataset(dataset_path, do_train, config, repeat_num=1, enable_cache=Fa
|
|||
nfs_dataset_cache = None
|
||||
|
||||
num_workers = config.num_workers
|
||||
if config.platform == "Ascend":
|
||||
rank_size = int(os.getenv("RANK_SIZE", '1'))
|
||||
rank_id = int(os.getenv("RANK_ID", '0'))
|
||||
if rank_size == 1:
|
||||
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_workers, shuffle=True,
|
||||
cache=nfs_dataset_cache)
|
||||
else:
|
||||
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_workers, shuffle=True,
|
||||
num_shards=rank_size, shard_id=rank_id, cache=nfs_dataset_cache)
|
||||
elif config.platform == "GPU":
|
||||
if do_train:
|
||||
if config.run_distribute:
|
||||
from mindspore.communication.management import get_rank, get_group_size
|
||||
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_workers, shuffle=True,
|
||||
num_shards=get_group_size(), shard_id=get_rank(),
|
||||
cache=nfs_dataset_cache)
|
||||
else:
|
||||
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_workers, shuffle=True,
|
||||
cache=nfs_dataset_cache)
|
||||
else:
|
||||
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_workers, shuffle=True,
|
||||
cache=nfs_dataset_cache)
|
||||
elif config.platform == "CPU":
|
||||
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_workers, \
|
||||
shuffle=True, cache=nfs_dataset_cache)
|
||||
data_set = ds.ImageFolderDataset(dataset_path, num_parallel_workers=num_workers, shuffle=do_train,
|
||||
num_shards=config.rank_size, shard_id=config.rank_id, cache=nfs_dataset_cache)
|
||||
|
||||
resize_height = config.image_height
|
||||
resize_width = config.image_width
|
||||
|
|
|
@ -42,14 +42,16 @@ class ClassifyCorrectCell(nn.Cell):
|
|||
>>> eval_net = nn.ClassifyCorrectCell(net)
|
||||
"""
|
||||
|
||||
def __init__(self, network):
|
||||
def __init__(self, network, run_distribute):
|
||||
super(ClassifyCorrectCell, self).__init__(auto_prefix=False)
|
||||
self._network = network
|
||||
self.argmax = P.Argmax()
|
||||
self.equal = P.Equal()
|
||||
self.cast = P.Cast()
|
||||
self.reduce_sum = P.ReduceSum()
|
||||
self.allreduce = P.AllReduce(P.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
|
||||
self.run_distribute = run_distribute
|
||||
if run_distribute:
|
||||
self.allreduce = P.AllReduce(P.ReduceOp.SUM, GlobalComm.WORLD_COMM_GROUP)
|
||||
|
||||
def construct(self, data, label):
|
||||
outputs = self._network(data)
|
||||
|
@ -58,8 +60,9 @@ class ClassifyCorrectCell(nn.Cell):
|
|||
y_correct = self.equal(y_pred, label)
|
||||
y_correct = self.cast(y_correct, mstype.float32)
|
||||
y_correct = self.reduce_sum(y_correct)
|
||||
total_correct = self.allreduce(y_correct)
|
||||
return (total_correct,)
|
||||
if self.run_distribute:
|
||||
y_correct = self.allreduce(y_correct)
|
||||
return (y_correct,)
|
||||
|
||||
|
||||
class DistAccuracy(nn.Metric):
|
||||
|
|
|
@ -14,72 +14,40 @@
|
|||
# ============================================================================
|
||||
|
||||
from mindspore import context
|
||||
from mindspore import nn
|
||||
from mindspore.common import dtype as mstype
|
||||
from mindspore.context import ParallelMode
|
||||
from mindspore.train.callback import ModelCheckpoint, CheckpointConfig
|
||||
from mindspore.communication.management import get_rank, init, get_group_size
|
||||
|
||||
from src.models import Monitor
|
||||
|
||||
|
||||
def switch_precision(net, data_type, config):
|
||||
if config.platform == "Ascend":
|
||||
net.to_float(data_type)
|
||||
for _, cell in net.cells_and_names():
|
||||
if isinstance(cell, nn.Dense):
|
||||
cell.to_float(mstype.float32)
|
||||
|
||||
|
||||
def context_device_init(config):
|
||||
if config.platform == "GPU" and config.run_distribute:
|
||||
config.device_id = 0
|
||||
config.rank_id = 0
|
||||
config.rank_size = 1
|
||||
if config.platform == "CPU":
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.platform, save_graphs=False)
|
||||
|
||||
elif config.platform == "GPU":
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.platform, save_graphs=False)
|
||||
if config.run_distribute:
|
||||
init()
|
||||
context.set_auto_parallel_context(device_num=get_group_size(),
|
||||
parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||
gradients_mean=True)
|
||||
|
||||
elif config.platform == "Ascend":
|
||||
elif config.platform in ["Ascend", "GPU"]:
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.platform, device_id=config.device_id,
|
||||
save_graphs=False)
|
||||
if config.run_distribute:
|
||||
init()
|
||||
config.rank_id = get_rank()
|
||||
config.rank_size = get_group_size()
|
||||
context.set_auto_parallel_context(device_num=config.rank_size,
|
||||
parallel_mode=ParallelMode.DATA_PARALLEL,
|
||||
gradients_mean=True, all_reduce_fusion_config=[140])
|
||||
init()
|
||||
gradients_mean=True)
|
||||
else:
|
||||
raise ValueError("Only support CPU, GPU and Ascend.")
|
||||
|
||||
|
||||
def set_context(config):
|
||||
if config.platform == "CPU":
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.platform,
|
||||
save_graphs=False)
|
||||
elif config.platform == "Ascend":
|
||||
context.set_context(mode=context.GRAPH_MODE, device_target=config.platform,
|
||||
device_id=config.device_id, save_graphs=False)
|
||||
elif config.platform == "GPU":
|
||||
context.set_context(mode=context.GRAPH_MODE,
|
||||
device_target=config.platform, save_graphs=False)
|
||||
|
||||
|
||||
def config_ckpoint(config, lr, step_size, model=None, eval_dataset=None):
|
||||
cb = [Monitor(lr_init=lr.asnumpy(), model=model, eval_dataset=eval_dataset)]
|
||||
if config.platform in ("CPU", "GPU") or config.rank_id == 0:
|
||||
|
||||
if config.save_checkpoint:
|
||||
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
|
||||
keep_checkpoint_max=config.keep_checkpoint_max)
|
||||
|
||||
rank = 0
|
||||
if config.run_distribute:
|
||||
rank = get_rank()
|
||||
|
||||
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(rank) + "/"
|
||||
ckpt_cb = ModelCheckpoint(prefix="mobilenetv2", directory=ckpt_save_dir, config=config_ck)
|
||||
cb += [ckpt_cb]
|
||||
if config.save_checkpoint and config.rank_id == 0:
|
||||
config_ck = CheckpointConfig(save_checkpoint_steps=config.save_checkpoint_epochs * step_size,
|
||||
keep_checkpoint_max=config.keep_checkpoint_max)
|
||||
ckpt_save_dir = config.save_checkpoint_path + "ckpt_" + str(config.rank_id) + "/"
|
||||
ckpt_cb = ModelCheckpoint(prefix="mobilenetv2", directory=ckpt_save_dir, config=config_ck)
|
||||
cb += [ckpt_cb]
|
||||
return cb
|
||||
|
|
|
@ -24,7 +24,6 @@ from mindspore import Tensor
|
|||
from mindspore.nn import WithLossCell, TrainOneStepCell
|
||||
from mindspore.nn.optim.momentum import Momentum
|
||||
from mindspore.nn.loss import SoftmaxCrossEntropyWithLogits
|
||||
from mindspore.communication.management import get_rank
|
||||
from mindspore.train.model import Model
|
||||
from mindspore.train.loss_scale_manager import FixedLossScaleManager
|
||||
from mindspore.train.serialization import save_checkpoint
|
||||
|
@ -37,7 +36,7 @@ from src.models import CrossEntropyWithLabelSmooth, define_net, load_ckpt
|
|||
from src.metric import DistAccuracy, ClassifyCorrectCell
|
||||
from src.model_utils.config import config
|
||||
from src.model_utils.moxing_adapter import moxing_wrapper
|
||||
from src.model_utils.device_adapter import get_device_id, get_device_num, get_rank_id
|
||||
from src.model_utils.device_adapter import get_device_id, get_device_num
|
||||
|
||||
|
||||
set_seed(1)
|
||||
|
@ -116,23 +115,16 @@ def build_params_groups(net):
|
|||
def train_mobilenetv2():
|
||||
config.train_dataset_path = os.path.join(config.dataset_path, 'train')
|
||||
config.eval_dataset_path = os.path.join(config.dataset_path, 'validation_preprocess')
|
||||
|
||||
config.device_id = get_device_id()
|
||||
config.rank_id = get_rank_id()
|
||||
config.rank_size = get_device_num()
|
||||
if config.platform == 'Ascend':
|
||||
config.run_distribute = config.rank_size > 1.
|
||||
|
||||
print('\nconfig: {} \n'.format(config))
|
||||
if not config.device_id:
|
||||
config.device_id = get_device_id()
|
||||
start = time.time()
|
||||
# set context and device init
|
||||
context_device_init(config)
|
||||
|
||||
print('\nconfig: {} \n'.format(config))
|
||||
# define network
|
||||
backbone_net, head_net, net = define_net(config, config.is_training)
|
||||
dataset = create_dataset(dataset_path=config.train_dataset_path, do_train=True, config=config,
|
||||
enable_cache=config.enable_cache, cache_session_id=config.cache_session_id)
|
||||
eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, config=config)
|
||||
step_size = dataset.get_dataset_size()
|
||||
if config.platform == "GPU":
|
||||
context.set_context(enable_graph_kernel=True)
|
||||
|
@ -165,23 +157,27 @@ def train_mobilenetv2():
|
|||
warmup_epochs=config.warmup_epochs,
|
||||
total_epochs=epoch_size,
|
||||
steps_per_epoch=step_size))
|
||||
|
||||
metrics = {"acc"}
|
||||
dist_eval_network = None
|
||||
eval_dataset = None
|
||||
if config.run_eval:
|
||||
metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.rank_size)}
|
||||
dist_eval_network = ClassifyCorrectCell(net, config.run_distribute)
|
||||
eval_dataset = create_dataset(dataset_path=config.eval_dataset_path, do_train=False, config=config)
|
||||
if config.pretrain_ckpt == "" or config.freeze_layer != "backbone":
|
||||
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
|
||||
group_params = build_params_groups(net)
|
||||
opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale)
|
||||
|
||||
metrics = {"acc"}
|
||||
dist_eval_network = None
|
||||
if config.run_distribute:
|
||||
metrics = {'acc': DistAccuracy(batch_size=config.batch_size, device_num=config.rank_size)}
|
||||
dist_eval_network = ClassifyCorrectCell(net)
|
||||
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
|
||||
metrics=metrics, eval_network=dist_eval_network,
|
||||
amp_level="O2", keep_batchnorm_fp32=False,
|
||||
acc_level=config.acc_mode)
|
||||
if config.platform == "Ascend":
|
||||
loss_scale = FixedLossScaleManager(config.loss_scale, drop_overflow_update=False)
|
||||
group_params = build_params_groups(net)
|
||||
opt = Momentum(group_params, lr, config.momentum, loss_scale=config.loss_scale)
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, loss_scale_manager=loss_scale,
|
||||
metrics=metrics, eval_network=dist_eval_network,
|
||||
amp_level="O2", keep_batchnorm_fp32=False,
|
||||
acc_level=config.acc_mode)
|
||||
|
||||
else:
|
||||
opt = Momentum(net.trainable_params(), lr, config.momentum, config.weight_decay)
|
||||
model = Model(net, loss_fn=loss, optimizer=opt, metrics=metrics, eval_network=dist_eval_network,
|
||||
acc_level=config.acc_mode)
|
||||
cb = config_ckpoint(config, lr, step_size, model, eval_dataset)
|
||||
print("============== Starting Training ==============")
|
||||
model.train(epoch_size, dataset, callbacks=cb)
|
||||
|
@ -197,9 +193,7 @@ def train_mobilenetv2():
|
|||
|
||||
features_path = config.train_dataset_path + '_features'
|
||||
idx_list = list(range(step_size))
|
||||
rank = 0
|
||||
if config.run_distribute:
|
||||
rank = get_rank()
|
||||
rank = config.rank_id
|
||||
save_ckpt_path = os.path.join(config.save_checkpoint_path, 'ckpt_' + str(rank) + '/')
|
||||
if not os.path.isdir(save_ckpt_path):
|
||||
os.mkdir(save_ckpt_path)
|
||||
|
|
Loading…
Reference in New Issue