From 659839a2500cd9717a676466ba7426006e517f89 Mon Sep 17 00:00:00 2001
From: linqingke
Date: Sat, 19 Sep 2020 10:44:55 +0800
Subject: [PATCH] densenet121 and mass fix.

---
 model_zoo/official/cv/densenet121/README.md   | 14 ++++----
 .../scripts/run_distribute_train.sh           | 13 ++++---
 .../official/cv/densenet121/src/config.py     |  2 +-
 .../cv/psenet/scripts/run_distribute_train.sh |  1 -
 model_zoo/official/cv/psenet/src/config.py    |  4 +--
 model_zoo/official/cv/psenet/src/dataset.py   | 36 ++++++++++++++++---
 model_zoo/official/cv/psenet/train.py         |  2 +-
 model_zoo/official/nlp/mass/train.py          |  4 ++-
 8 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/model_zoo/official/cv/densenet121/README.md b/model_zoo/official/cv/densenet121/README.md
index ebaf8afd121..f78d67a3fcc 100644
--- a/model_zoo/official/cv/densenet121/README.md
+++ b/model_zoo/official/cv/densenet121/README.md
@@ -18,7 +18,7 @@
 - [Model Description](#model-description)
     - [Performance](#performance)
         - [Training accuracy results](#training-accuracy-results)
-        - [Training performance results](#yraining-performance-results)
+        - [Training performance results](#training-performance-results)
 - [Description of Random Situation](#description-of-random-situation)
 - [ModelZoo Homepage](#modelzoo-homepage)
 
@@ -80,13 +80,13 @@ After installing MindSpore via the official website, you can start training and
 
   ```python
   # run training example
-  python train.py --data_dir /PATH/TO/DATASET --is_distributed 0> train.log 2>&1 &
+  python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &
 
   # run distributed training example
-  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
+  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT
 
   # run evaluation example
-  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
+  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
   OR
   sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
   ```
@@ -168,7 +168,7 @@ You can modify the training behaviour through the various flags in the `train.py
 - running on Ascend
 
   ```
-  python train.py --data_dir /PATH/TO/DATASET --is_distributed 0 > train.log 2>&1 &
+  python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &
   ```
 
   The python command above will run in the background, The log and model checkpoint will be generated in `output/202x-xx-xx_time_xx_xx_xx/`. The loss value will be achieved as follows:
@@ -190,7 +190,7 @@ You can modify the training behaviour through the various flags in the `train.py
 - running on Ascend
 
   ```
-  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
+  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT
   ```
 
   The above shell script will run distribute training in the background. You can view the results log and model checkpoint through the file `train[X]/output/202x-xx-xx_time_xx_xx_xx/`. The loss value will be achieved as follows:
@@ -217,7 +217,7 @@ You can modify the training behaviour through the various flags in the `train.py
 running the command below for evaluation.
 
   ```
-  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
+  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
   OR
   sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
   ```
diff --git a/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh b/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh
index 8a5aac8ab96..51f16649b16 100644
--- a/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh
@@ -16,8 +16,8 @@
 
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh scipts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET"
-echo "for example: sh scipts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset"
+echo "sh scripts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET CKPT_FILE"
+echo "for example: sh scripts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset ckpt_file"
 echo "It is better to use absolute path."
 echo "================================================================================================================="
 
@@ -26,6 +26,7 @@ echo "After running the scipt, the network runs in the background. The log will
 export RANK_SIZE=$1
 export RANK_TABLE_FILE=$2
 DATASET=$3
+CKPT_FILE=$4
 
 for((i=0;i<RANK_SIZE;i++))
 do
     env > env.log
-    python train.py \
-        --data_dir=$DATASET > log.txt 2>&1 &
+    if [ -f $CKPT_FILE ]
+    then
+        python train.py --data_dir=$DATASET --pretrained=$CKPT_FILE > log.txt 2>&1 &
+    else
+        python train.py --data_dir=$DATASET > log.txt 2>&1 &
+    fi
     cd ../
 done
diff --git a/model_zoo/official/cv/densenet121/src/config.py b/model_zoo/official/cv/densenet121/src/config.py
index b925ac7d94b..f90dea09bb3 100644
--- a/model_zoo/official/cv/densenet121/src/config.py
+++ b/model_zoo/official/cv/densenet121/src/config.py
@@ -37,7 +37,7 @@ config = ed({
     "label_smooth_factor": 0.1,
     "log_interval": 100,
 
-    "ckpt_interval": 2000,
+    "ckpt_interval": 50000,
     "ckpt_path": 'outputs/',
     "is_save_on_master": 1,
 
diff --git a/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh b/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh
index c48bb04a024..147a36610cb 100644
--- a/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh
@@ -41,7 +41,6 @@ fi
 
 python ${current_exec_path}/src/generate_hccn_file.py
-ulimit -u unlimited
 export DEVICE_NUM=4
 export RANK_SIZE=4
 export RANK_TABLE_FILE=${current_exec_path}/rank_table_4p.json
diff --git a/model_zoo/official/cv/psenet/src/config.py b/model_zoo/official/cv/psenet/src/config.py
index e7d28b55734..8e969a8ac81 100644
--- a/model_zoo/official/cv/psenet/src/config.py
+++ b/model_zoo/official/cv/psenet/src/config.py
@@ -30,7 +30,7 @@ config = ed({
     'NECK_OUT_CHANNEL': 256,
 
     # dataset for train
-    "TRAIN_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
+    "TRAIN_ROOT_DIR": 'psenet/ic15/',
     "TRAIN_IS_TRANSFORM": True,
     "TRAIN_LONG_SIZE": 640,
     "TRAIN_DATASET_SIZE": 1000,
@@ -43,7 +43,7 @@ config = ed({
     "TRAIN_MODEL_SAVE_PATH": './checkpoints/',
 
     # dataset for test
-    "TEST_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
+    "TEST_ROOT_DIR": 'psenet/ic15/',
     "TEST_DATASET_SIZE": 500,
     "TEST_BUFFER_SIZE": 4,
     "TEST_DROP_REMAINDER": False,
diff --git a/model_zoo/official/cv/psenet/src/dataset.py b/model_zoo/official/cv/psenet/src/dataset.py
index 42d29b5f7b3..df373174f4f 100644
--- a/model_zoo/official/cv/psenet/src/dataset.py
+++ b/model_zoo/official/cv/psenet/src/dataset.py
@@ -16,6 +16,7 @@
 import os
 import random
+import math
 import cv2
 import pyclipper
 import numpy as np
@@ -298,13 +299,40 @@ def IC15_TEST_Generator():
         yield img, img_resized, img_name
 
 
-def train_dataset_creator():
+class DistributedSampler():
+    def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
+        self.dataset = dataset
+        self.rank = rank
+        self.group_size = group_size
+        self.dataset_len = len(self.dataset)
+        self.num_samplers = int(math.ceil(self.dataset_len * 1.0 / self.group_size))
+        self.total_size = self.num_samplers * self.group_size
+        self.shuffle = shuffle
+        self.seed = seed
+
+    def __iter__(self):
+        if self.shuffle:
+            self.seed = (self.seed + 1) & 0xffffffff
+            np.random.seed(self.seed)
+            indices = np.random.permutation(self.dataset_len).tolist()
+        else:
+            indices = list(range(len(self.dataset_len)))
+
+        indices += indices[:(self.total_size - len(indices))]
+        indices = indices[self.rank::self.group_size]
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samplers
+
+def train_dataset_creator(rank, group_size, shuffle=True):
     cv2.setNumThreads(0)
     dataset = TrainDataset()
-    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8)
-    #ds = ds.repeat(config.TRAIN_REPEAT_NUM)
+    sampler = DistributedSampler(dataset, rank, group_size, shuffle)
+    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
+                             sampler=sampler)
+    ds = ds.repeat(1)
     ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
-    ds = ds.shuffle(buffer_size=config.TRAIN_BUFFER_SIZE)
     return ds
 
 def test_dataset_creator():
diff --git a/model_zoo/official/cv/psenet/train.py b/model_zoo/official/cv/psenet/train.py
index 897e21d54e9..513e284950e 100644
--- a/model_zoo/official/cv/psenet/train.py
+++ b/model_zoo/official/cv/psenet/train.py
@@ -54,7 +54,7 @@ def train():
         rank_id = get_rank()
 
     # dataset/network/criterion/optim
-    ds = train_dataset_creator()
+    ds = train_dataset_creator(args.device_id, args.device_num)
     step_size = ds.get_dataset_size()
     print('Create dataset done!')
 
diff --git a/model_zoo/official/nlp/mass/train.py b/model_zoo/official/nlp/mass/train.py
index d0f93c46840..9b6a9e5ae14 100644
--- a/model_zoo/official/nlp/mass/train.py
+++ b/model_zoo/official/nlp/mass/train.py
@@ -25,7 +25,7 @@ from mindspore.nn import Momentum
 from mindspore.nn.optim import Adam, Lamb
 from mindspore.train.model import Model
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
-from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
+from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
 from mindspore import context, Parameter
 from mindspore.context import ParallelMode
 from mindspore.communication import management as MultiAscend
@@ -216,11 +216,13 @@ def _build_training_pipeline(config: TransformerConfig,
                              scale_update_cell=scale_manager.get_update_cell())
     net_with_grads.set_train(True)
     model = Model(net_with_grads)
+    time_cb = TimeMonitor(data_size=dataset.get_dataset_size())
     ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                    keep_checkpoint_max=config.keep_ckpt_max)
     rank_size = os.getenv('RANK_SIZE')
     callbacks = []
+    callbacks.append(time_cb)
     if rank_size is not None and int(rank_size) > 1:
         loss_monitor = LossCallBack(config,
                                     rank_id=MultiAscend.get_rank())
         callbacks.append(loss_monitor)
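
For reference, the sharding rule introduced by the new `DistributedSampler` in `model_zoo/official/cv/psenet/src/dataset.py` can be exercised on its own. The sketch below is not the committed code: it takes the dataset length directly instead of a dataset object, uses a hypothetical name, and its non-shuffle branch builds `list(range(dataset_len))`, whereas the patch's `list(range(len(self.dataset_len)))` would raise a `TypeError` because `dataset_len` is an int. The idea is the same: pad the index list to a multiple of `group_size`, then stride by rank so every device gets an equally sized, (up to padding) disjoint shard.

```python
import math
import numpy as np


class ShardSamplerSketch:
    """Minimal sketch: pad indices to a multiple of group_size, then stride by rank."""

    def __init__(self, dataset_len, rank, group_size, shuffle=True, seed=0):
        self.dataset_len = dataset_len
        self.rank = rank
        self.group_size = group_size
        # Every rank sees the same number of samples per epoch.
        self.num_samples = int(math.ceil(dataset_len * 1.0 / group_size))
        self.total_size = self.num_samples * group_size
        self.shuffle = shuffle
        self.seed = seed

    def __iter__(self):
        if self.shuffle:
            # Advance the shared seed each epoch: all ranks generate the same
            # permutation, so their shards stay non-overlapping.
            self.seed = (self.seed + 1) & 0xffffffff
            np.random.seed(self.seed)
            indices = np.random.permutation(self.dataset_len).tolist()
        else:
            indices = list(range(self.dataset_len))
        # Pad with the leading indices, then take every group_size-th index.
        indices += indices[:(self.total_size - len(indices))]
        return iter(indices[self.rank::self.group_size])

    def __len__(self):
        return self.num_samples


if __name__ == "__main__":
    # 10 samples over 4 ranks -> ceil(10/4) = 3 indices per rank; indices 0 and 1
    # reappear as padding on the last two ranks.
    for rank in range(4):
        print(rank, list(ShardSamplerSketch(10, rank, 4, shuffle=False)))
```

In the committed psenet `train.py`, the creator is wired up as `train_dataset_creator(args.device_id, args.device_num)`, so each device builds its `GeneratorDataset` over only its own shard; the previous `ds.shuffle(buffer_size=config.TRAIN_BUFFER_SIZE)` call is dropped because shuffling now happens inside the sampler.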