!6534 densenet121 and mass fix
Merge pull request !6534 from linqingke/mass
commit daada0303c
@@ -18,7 +18,7 @@
 - [Model Description](#model-description)
 - [Performance](#performance)
 - [Training accuracy results](#training-accuracy-results)
-- [Training performance results](#yraining-performance-results)
+- [Training performance results](#training-performance-results)
 - [Description of Random Situation](#description-of-random-situation)
 - [ModelZoo Homepage](#modelzoo-homepage)

@@ -80,13 +80,13 @@ After installing MindSpore via the official website, you can start training and

 ```python
 # run training example
-python train.py --data_dir /PATH/TO/DATASET --is_distributed 0> train.log 2>&1 &
+python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &

 # run distributed training example
-sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
+sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT

 # run evaluation example
-python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
+python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
 OR
 sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
 ```

@@ -168,7 +168,7 @@ You can modify the training behaviour through the various flags in the `train.py
 - running on Ascend

 ```
-python train.py --data_dir /PATH/TO/DATASET --is_distributed 0 > train.log 2>&1 &
+python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &
 ```

 The python command above will run in the background, The log and model checkpoint will be generated in `output/202x-xx-xx_time_xx_xx_xx/`. The loss value will be achieved as follows:

@@ -190,7 +190,7 @@ You can modify the training behaviour through the various flags in the `train.py
 - running on Ascend

 ```
-sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
+sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT
 ```

 The above shell script will run distribute training in the background. You can view the results log and model checkpoint through the file `train[X]/output/202x-xx-xx_time_xx_xx_xx/`. The loss value will be achieved as follows:

@@ -217,7 +217,7 @@ You can modify the training behaviour through the various flags in the `train.py
 running the command below for evaluation.

 ```
-python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
+python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
 OR
 sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
 ```

@@ -16,8 +16,8 @@

 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh scipts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET"
-echo "for example: sh scipts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset"
+echo "sh scripts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET CKPT_FILE"
+echo "for example: sh scripts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset ckpt_file"
 echo "It is better to use absolute path."
 echo "================================================================================================================="

@@ -26,6 +26,7 @@ echo "After running the scipt, the network runs in the background. The log will
 export RANK_SIZE=$1
 export RANK_TABLE_FILE=$2
 DATASET=$3
+CKPT_FILE=$4

 for((i=0;i<RANK_SIZE;i++))
 do

@@ -38,8 +39,12 @@ do
     export RANK_ID=$i
     echo "start training for rank $i, device $DEVICE_ID"
     env > env.log
-    python train.py \
-        --data_dir=$DATASET > log.txt 2>&1 &
+    if [ -f $CKPT_FILE ]
+    then
+        python train.py --data_dir=$DATASET --pretrained=$CKPT_FILE > log.txt 2>&1 &
+    else
+        python train.py --data_dir=$DATASET > log.txt 2>&1 &
+    fi

     cd ../
 done

@@ -37,7 +37,7 @@ config = ed({
     "label_smooth_factor": 0.1,

     "log_interval": 100,
-    "ckpt_interval": 2000,
+    "ckpt_interval": 50000,
     "ckpt_path": 'outputs/',
     "is_save_on_master": 1,

@@ -41,7 +41,6 @@ fi

-python ${current_exec_path}/src/generate_hccn_file.py

 ulimit -u unlimited
 export DEVICE_NUM=4
 export RANK_SIZE=4
 export RANK_TABLE_FILE=${current_exec_path}/rank_table_4p.json

@@ -30,7 +30,7 @@ config = ed({
     'NECK_OUT_CHANNEL': 256,

     # dataset for train
-    "TRAIN_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
+    "TRAIN_ROOT_DIR": 'psenet/ic15/',
     "TRAIN_IS_TRANSFORM": True,
     "TRAIN_LONG_SIZE": 640,
     "TRAIN_DATASET_SIZE": 1000,

@@ -43,7 +43,7 @@ config = ed({
     "TRAIN_MODEL_SAVE_PATH": './checkpoints/',

     # dataset for test
-    "TEST_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
+    "TEST_ROOT_DIR": 'psenet/ic15/',
     "TEST_DATASET_SIZE": 500,
     "TEST_BUFFER_SIZE": 4,
     "TEST_DROP_REMAINDER": False,

@@ -16,6 +16,7 @@

 import os
 import random
+import math
 import cv2
 import pyclipper
 import numpy as np

@@ -298,13 +299,40 @@ def IC15_TEST_Generator():

     yield img, img_resized, img_name


-def train_dataset_creator():
+class DistributedSampler():
+    def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
+        self.dataset = dataset
+        self.rank = rank
+        self.group_size = group_size
+        self.dataset_len = len(self.dataset)
+        self.num_samplers = int(math.ceil(self.dataset_len * 1.0 / self.group_size))
+        self.total_size = self.num_samplers * self.group_size
+        self.shuffle = shuffle
+        self.seed = seed
+
+    def __iter__(self):
+        if self.shuffle:
+            self.seed = (self.seed + 1) & 0xffffffff
+            np.random.seed(self.seed)
+            indices = np.random.permutation(self.dataset_len).tolist()
+        else:
+            indices = list(range(len(self.dataset_len)))
+
+        indices += indices[:(self.total_size - len(indices))]
+        indices = indices[self.rank::self.group_size]
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samplers
+
+
+def train_dataset_creator(rank, group_size, shuffle=True):
     cv2.setNumThreads(0)
     dataset = TrainDataset()
-    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8)
-    #ds = ds.repeat(config.TRAIN_REPEAT_NUM)
+    sampler = DistributedSampler(dataset, rank, group_size, shuffle)
+    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
+                             sampler=sampler)
+    ds = ds.repeat(1)
     ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
     ds = ds.shuffle(buffer_size=config.TRAIN_BUFFER_SIZE)
     return ds


 def test_dataset_creator():

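For reference, the index partitioning that the new `DistributedSampler` above performs can be reproduced with a small standalone sketch: indices are padded up to a multiple of `group_size`, then each rank takes a strided slice starting at its own rank, so every rank iterates the same number of samples. The `shard_indices` helper name and the toy sizes below are illustrative only, not part of the change.

```python
import math
import numpy as np

def shard_indices(dataset_len, rank, group_size, shuffle=False, seed=0):
    # Each rank gets ceil(dataset_len / group_size) samples.
    num_samples = int(math.ceil(dataset_len / group_size))
    total_size = num_samples * group_size
    if shuffle:
        np.random.seed(seed)
        indices = np.random.permutation(dataset_len).tolist()
    else:
        indices = list(range(dataset_len))
    # Pad by repeating the leading indices so every rank gets a full share.
    indices += indices[:total_size - len(indices)]
    # Strided slice: rank, rank + group_size, rank + 2 * group_size, ...
    return indices[rank::group_size]

# 10 samples over 4 ranks -> 3 indices per rank; the first two indices are reused as padding.
for r in range(4):
    print(r, shard_indices(10, rank=r, group_size=4))
# 0 [0, 4, 8]
# 1 [1, 5, 9]
# 2 [2, 6, 0]
# 3 [3, 7, 1]
```
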
@@ -54,7 +54,7 @@ def train():
     rank_id = get_rank()

     # dataset/network/criterion/optim
-    ds = train_dataset_creator()
+    ds = train_dataset_creator(args.device_id, args.device_num)
     step_size = ds.get_dataset_size()
     print('Create dataset done!')

@@ -25,7 +25,7 @@ from mindspore.nn import Momentum
 from mindspore.nn.optim import Adam, Lamb
 from mindspore.train.model import Model
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
-from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
+from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
 from mindspore import context, Parameter
 from mindspore.context import ParallelMode
 from mindspore.communication import management as MultiAscend

@@ -216,11 +216,13 @@ def _build_training_pipeline(config: TransformerConfig,
                                                   scale_update_cell=scale_manager.get_update_cell())
     net_with_grads.set_train(True)
     model = Model(net_with_grads)
+    time_cb = TimeMonitor(data_size=dataset.get_dataset_size())
     ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                    keep_checkpoint_max=config.keep_ckpt_max)

     rank_size = os.getenv('RANK_SIZE')
     callbacks = []
+    callbacks.append(time_cb)
     if rank_size is not None and int(rank_size) > 1:
         loss_monitor = LossCallBack(config, rank_id=MultiAscend.get_rank())
         callbacks.append(loss_monitor)
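As a usage note, the `TimeMonitor` and checkpoint callbacks assembled above are ultimately passed to `model.train` further down the pipeline. Below is a hedged sketch of that wiring; the `build_callbacks` helper, the prefix/directory strings, and the step counts are illustrative assumptions, not the MASS code itself.

```python
from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor

def build_callbacks(dataset, rank_id, save_steps=2500, keep_max=50):
    # Print the average per-step time every data_size steps (here: once per pass over the dataset).
    callbacks = [TimeMonitor(data_size=dataset.get_dataset_size())]
    # Write checkpoints from a single rank only, so devices do not overwrite each other's files.
    if rank_id == 0:
        ckpt_config = CheckpointConfig(save_checkpoint_steps=save_steps,
                                       keep_checkpoint_max=keep_max)
        callbacks.append(ModelCheckpoint(prefix="mass", directory="checkpoints",
                                         config=ckpt_config))
    return callbacks

# model.train(epoch_size, dataset, callbacks=build_callbacks(dataset, rank_id))
```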