optimize FaceRecognitionForTracking training speed

zhouneng2 2021-07-21 10:42:44 +08:00
parent c168ecce09
commit 900375eeb6
4 changed files with 22 additions and 9 deletions

View File

@@ -46,7 +46,7 @@ world_size: 8
# logging related
log_interval: 10
ckpt_path: '../../output'
-ckpt_interval: 200
+ckpt_interval: 400
# train/eval option
data_dir: ''

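Doubling ckpt_interval from 200 to 400 steps halves how often checkpoints are written, cutting checkpoint I/O during training. A minimal sketch of how such an interval is typically wired into MindSpore's checkpoint callback (the prefix and keep_checkpoint_max below are placeholders, and it is an assumption that this repo feeds ckpt_interval into save_checkpoint_steps):

    from mindspore.train.callback import ModelCheckpoint, CheckpointConfig

    ckpt_interval = 400  # steps between saved checkpoints; was 200 before this commit
    ckpt_config = CheckpointConfig(save_checkpoint_steps=ckpt_interval, keep_checkpoint_max=10)
    ckpt_cb = ModelCheckpoint(prefix='reid', directory='../../output', config=ckpt_config)  # placeholder prefix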
View File

@@ -70,6 +70,10 @@ echo $PRETRAINED_BACKBONE
export RANK_TABLE_FILE=$RANK_TABLE
export RANK_SIZE=8
+cpus=`cat /proc/cpuinfo| grep "processor"| wc -l`
+avg=`expr $cpus \/ $RANK_SIZE`
+gap=`expr $avg \- 1`
config_path="${dirname_path}/reid_8p_ascend_config.yaml"
echo "config path is : ${config_path}"
@@ -77,12 +81,15 @@ echo 'start training'
for((i=0;i<=$RANK_SIZE-1;i++));
do
echo 'start rank '$i
+start=`expr $i \* $avg`
+end=`expr $start \+ $gap`
+cmdopt=$start"-"$end
mkdir ${current_exec_path}/device$i
cd ${current_exec_path}/device$i || exit
export RANK_ID=$i
dev=`expr $i + 0`
export DEVICE_ID=$dev
-python ${dirname_path}/${SCRIPT_NAME} \
+taskset -c $cmdopt python ${dirname_path}/${SCRIPT_NAME} \
--config_path=$config_path \
--is_distributed=1 \
--data_dir=$DATA_DIR \

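The added lines pin each training process to its own slice of CPU cores: avg = total cores / RANK_SIZE cores per rank, and rank i is bound with taskset -c to the range i*avg through i*avg + gap. For example, on a 96-core host with RANK_SIZE=8, avg=12 and gap=11, so rank 0 runs on cores 0-11 (taskset -c 0-11), rank 1 on cores 12-23, and so on. This keeps the eight processes, and their dataset worker threads, from contending for the same cores.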
View File

@@ -38,9 +38,9 @@ def get_de_dataset(args):
VC.HWC2CHW()]
de_dataset = de.ImageFolderDataset(dataset_dir=args.data_dir, num_shards=args.world_size,
-shard_id=args.local_rank, shuffle=True)
-de_dataset = de_dataset.map(input_columns="image", operations=transform_img)
-de_dataset = de_dataset.map(input_columns="label", operations=transform_label)
+shard_id=args.local_rank, shuffle=True, num_parallel_workers=4)
+de_dataset = de_dataset.map(input_columns="image", operations=transform_img, num_parallel_workers=4)
+de_dataset = de_dataset.map(input_columns="label", operations=transform_label, num_parallel_workers=4)
de_dataset = de_dataset.project(columns=["image", "label"])
de_dataset = de_dataset.batch(args.per_batch_size, drop_remainder=True)

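Setting num_parallel_workers=4 on both the file reader and the map operations lets image decoding and augmentation run in four worker threads per op instead of one, so the host input pipeline is less likely to starve the eight Ascend devices. A minimal standalone sketch of the same pattern (the path, transform list, and batch size are placeholders, not the repo's actual values):

    import mindspore.dataset as de
    import mindspore.dataset.vision.c_transforms as VC

    ds = de.ImageFolderDataset(dataset_dir='/path/to/train', shuffle=True,
                               num_parallel_workers=4)   # 4 reader threads
    ds = ds.map(input_columns="image", operations=[VC.Decode(), VC.HWC2CHW()],
                num_parallel_workers=4)                  # 4 decode/augment threads
    ds = ds.batch(64, drop_remainder=True)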
View File

@@ -22,7 +22,6 @@ import numpy as np
import mindspore
from mindspore import context
-from mindspore import Tensor
from mindspore.context import ParallelMode
from mindspore.train.serialization import load_checkpoint, load_param_into_net
from mindspore.train.callback import ModelCheckpoint, RunContext, _InternalCallbackParam, CheckpointConfig
@@ -67,6 +66,9 @@ def init_argument():
context.set_auto_parallel_context(parallel_mode=parallel_mode, device_num=config.world_size,
gradients_mean=True)
+if config.device_target == 'Ascend' and config.is_distributed:
+    context.set_auto_parallel_context(all_reduce_fusion_config=[1, 10])
mindspore.common.set_seed(1)
# logger
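all_reduce_fusion_config splits the gradients into fusion groups at the given parameter indices (here 1 and 10), so the AllReduce for earlier groups can start while back-propagation of later layers is still running, overlapping communication with computation. A minimal sketch of the setting (the DATA_PARALLEL mode and device count are assumptions for illustration; the script derives its own parallel_mode and world_size):

    from mindspore import context
    from mindspore.context import ParallelMode

    # fuse gradient AllReduce into groups split at parameter indices 1 and 10
    context.set_auto_parallel_context(parallel_mode=ParallelMode.DATA_PARALLEL,
                                      device_num=8, gradients_mean=True,
                                      all_reduce_fusion_config=[1, 10])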
@@ -141,7 +143,13 @@ def run_train():
de_dataset, steps_per_epoch, class_num = get_de_dataset(cfg)
cfg.steps_per_epoch = steps_per_epoch
cfg.logger.info('step per epoch: %s', cfg.steps_per_epoch)
-de_dataloader = de_dataset.create_tuple_iterator()
+# increase training speed for Ascend and distribute mode
+if config.device_target == 'Ascend' and config.is_distributed:
+    de_dataloader = de_dataset.create_tuple_iterator(do_copy=False)
+else:
+    de_dataloader = de_dataset.create_tuple_iterator()
cfg.logger.info('class num original: %s', class_num)
if class_num % 16 != 0:
class_num = (class_num // 16 + 1) * 16
@@ -214,8 +222,6 @@ def run_train():
cfg.logger.important_info('====start train====')
for i, total_data in enumerate(de_dataloader):
data, gt = total_data
-data = Tensor(data)
-gt = Tensor(gt)
loss = train_net(data, gt)
loss_meter.update(loss.asnumpy())
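
With do_copy=False, create_tuple_iterator hands back the pipeline's output without an extra per-step host-side copy, and since the iterator already yields mindspore.Tensor objects, the explicit Tensor(data)/Tensor(gt) wrapping in the training loop becomes redundant and is removed. A condensed sketch of the resulting loop (names as in the diff above):

    de_dataloader = de_dataset.create_tuple_iterator(do_copy=False)  # Ascend, distributed case
    for i, (data, gt) in enumerate(de_dataloader):
        loss = train_net(data, gt)        # items are already mindspore.Tensor
        loss_meter.update(loss.asnumpy())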