From 659839a2500cd9717a676466ba7426006e517f89 Mon Sep 17 00:00:00 2001
From: linqingke
Date: Sat, 19 Sep 2020 10:44:55 +0800
Subject: [PATCH] densenet121 and mass fix.

---
 model_zoo/official/cv/densenet121/README.md   | 14 ++++----
 .../scripts/run_distribute_train.sh           | 13 ++++---
 .../official/cv/densenet121/src/config.py     |  2 +-
 .../cv/psenet/scripts/run_distribute_train.sh |  1 -
 model_zoo/official/cv/psenet/src/config.py    |  4 +--
 model_zoo/official/cv/psenet/src/dataset.py   | 36 ++++++++++++++++---
 model_zoo/official/cv/psenet/train.py         |  2 +-
 model_zoo/official/nlp/mass/train.py          |  4 ++-
 8 files changed, 55 insertions(+), 21 deletions(-)

diff --git a/model_zoo/official/cv/densenet121/README.md b/model_zoo/official/cv/densenet121/README.md
index ebaf8afd121..f78d67a3fcc 100644
--- a/model_zoo/official/cv/densenet121/README.md
+++ b/model_zoo/official/cv/densenet121/README.md
@@ -18,7 +18,7 @@
 - [Model Description](#model-description)
     - [Performance](#performance)
         - [Training accuracy results](#training-accuracy-results)
-        - [Training performance results](#yraining-performance-results)
+        - [Training performance results](#training-performance-results)
 - [Description of Random Situation](#description-of-random-situation)
 - [ModelZoo Homepage](#modelzoo-homepage)
 
@@ -80,13 +80,13 @@ After installing MindSpore via the official website, you can start training and
 
   ```python
   # run training example
-  python train.py --data_dir /PATH/TO/DATASET --is_distributed 0> train.log 2>&1 &
+  python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &
 
   # run distributed training example
-  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
+  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT
 
   # run evaluation example
-  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
+  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
   OR
   sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
   ```
@@ -168,7 +168,7 @@ You can modify the training behaviour through the various flags in the `train.py
 - running on Ascend
 
   ```
-  python train.py --data_dir /PATH/TO/DATASET --is_distributed 0 > train.log 2>&1 &
+  python train.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/PRETRAINED_CKPT --is_distributed 0 > train.log 2>&1 &
   ```
 
   The python command above will run in the background, The log and model checkpoint will be generated in `output/202x-xx-xx_time_xx_xx_xx/`. The loss value will be achieved as follows:
@@ -190,7 +190,7 @@ You can modify the training behaviour through the various flags in the `train.py
 - running on Ascend
 
   ```
-  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET
+  sh scripts/run_distribute_train.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/PRETRAINED_CKPT
   ```
 
   The above shell script will run distribute training in the background. You can view the results log and model checkpoint through the file `train[X]/output/202x-xx-xx_time_xx_xx_xx/`. The loss value will be achieved as follows:
@@ -217,7 +217,7 @@ You can modify the training behaviour through the various flags in the `train.py
 running the command below for evaluation.
 
   ```
-  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT> eval.log 2>&1 &
+  python eval.py --data_dir /PATH/TO/DATASET --pretrained /PATH/TO/CHECKPOINT > eval.log 2>&1 &
   OR
   sh scripts/run_distribute_eval.sh 8 rank_table.json /PATH/TO/DATASET /PATH/TO/CHECKPOINT
   ```
diff --git a/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh b/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh
index 8a5aac8ab96..51f16649b16 100644
--- a/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/densenet121/scripts/run_distribute_train.sh
@@ -16,8 +16,8 @@
 
 echo "=============================================================================================================="
 echo "Please run the scipt as: "
-echo "sh scipts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET"
-echo "for example: sh scipts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset"
+echo "sh scripts/run_distribute_train.sh DEVICE_NUM RANK_TABLE_FILE DATASET CKPT_FILE"
+echo "for example: sh scripts/run_distribute_train.sh 8 /data/hccl.json /path/to/dataset ckpt_file"
 echo "It is better to use absolute path."
 echo "================================================================================================================="
 
@@ -26,6 +26,7 @@ echo "After running the scipt, the network runs in the background. The log will
 export RANK_SIZE=$1
 export RANK_TABLE_FILE=$2
 DATASET=$3
+CKPT_FILE=$4
 
 for((i=0;i<RANK_SIZE;i++))
 do
     env > env.log
-    python train.py \
-        --data_dir=$DATASET > log.txt 2>&1 &
+    if [ -f $CKPT_FILE ]
+    then
+        python train.py --data_dir=$DATASET --pretrained=$CKPT_FILE > log.txt 2>&1 &
+    else
+        python train.py --data_dir=$DATASET > log.txt 2>&1 &
+    fi
     cd ../
 done
diff --git a/model_zoo/official/cv/densenet121/src/config.py b/model_zoo/official/cv/densenet121/src/config.py
index b925ac7d94b..f90dea09bb3 100644
--- a/model_zoo/official/cv/densenet121/src/config.py
+++ b/model_zoo/official/cv/densenet121/src/config.py
@@ -37,7 +37,7 @@ config = ed({
     "label_smooth_factor": 0.1,
     "log_interval": 100,
 
-    "ckpt_interval": 2000,
+    "ckpt_interval": 50000,
     "ckpt_path": 'outputs/',
     "is_save_on_master": 1,
 
diff --git a/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh b/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh
index c48bb04a024..147a36610cb 100644
--- a/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh
+++ b/model_zoo/official/cv/psenet/scripts/run_distribute_train.sh
@@ -41,7 +41,6 @@ fi
 
 python ${current_exec_path}/src/generate_hccn_file.py
-ulimit -u unlimited
 export DEVICE_NUM=4
 export RANK_SIZE=4
 export RANK_TABLE_FILE=${current_exec_path}/rank_table_4p.json
diff --git a/model_zoo/official/cv/psenet/src/config.py b/model_zoo/official/cv/psenet/src/config.py
index e7d28b55734..8e969a8ac81 100644
--- a/model_zoo/official/cv/psenet/src/config.py
+++ b/model_zoo/official/cv/psenet/src/config.py
@@ -30,7 +30,7 @@ config = ed({
     'NECK_OUT_CHANNEL': 256,
 
     # dataset for train
-    "TRAIN_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
+    "TRAIN_ROOT_DIR": 'psenet/ic15/',
     "TRAIN_IS_TRANSFORM": True,
     "TRAIN_LONG_SIZE": 640,
     "TRAIN_DATASET_SIZE": 1000,
@@ -43,7 +43,7 @@ config = ed({
     "TRAIN_MODEL_SAVE_PATH": './checkpoints/',
 
     # dataset for test
-    "TEST_ROOT_DIR": '/autotest/lqk/modelzoo/psenet/ic15/',
+    "TEST_ROOT_DIR": 'psenet/ic15/',
     "TEST_DATASET_SIZE": 500,
     "TEST_BUFFER_SIZE": 4,
     "TEST_DROP_REMAINDER": False,
diff --git a/model_zoo/official/cv/psenet/src/dataset.py b/model_zoo/official/cv/psenet/src/dataset.py
index 42d29b5f7b3..df373174f4f 100644
--- a/model_zoo/official/cv/psenet/src/dataset.py
+++ b/model_zoo/official/cv/psenet/src/dataset.py
@@ -16,6 +16,7 @@
 import os
 import random
+import math
 import cv2
 import pyclipper
 import numpy as np
@@ -298,13 +299,40 @@ def IC15_TEST_Generator():
         yield img, img_resized, img_name
 
 
-def train_dataset_creator():
+class DistributedSampler():
+    def __init__(self, dataset, rank, group_size, shuffle=True, seed=0):
+        self.dataset = dataset
+        self.rank = rank
+        self.group_size = group_size
+        self.dataset_len = len(self.dataset)
+        self.num_samplers = int(math.ceil(self.dataset_len * 1.0 / self.group_size))
+        self.total_size = self.num_samplers * self.group_size
+        self.shuffle = shuffle
+        self.seed = seed
+
+    def __iter__(self):
+        if self.shuffle:
+            self.seed = (self.seed + 1) & 0xffffffff
+            np.random.seed(self.seed)
+            indices = np.random.permutation(self.dataset_len).tolist()
+        else:
+            indices = list(range(len(self.dataset_len)))
+
+        indices += indices[:(self.total_size - len(indices))]
+        indices = indices[self.rank::self.group_size]
+        return iter(indices)
+
+    def __len__(self):
+        return self.num_samplers
+
+def train_dataset_creator(rank, group_size, shuffle=True):
     cv2.setNumThreads(0)
     dataset = TrainDataset()
-    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8)
-    #ds = ds.repeat(config.TRAIN_REPEAT_NUM)
+    sampler = DistributedSampler(dataset, rank, group_size, shuffle)
+    ds = de.GeneratorDataset(dataset, ['img', 'gt_text', 'gt_kernels', 'training_mask'], num_parallel_workers=8,
+                             sampler=sampler)
+    ds = ds.repeat(1)
     ds = ds.batch(config.TRAIN_BATCH_SIZE, drop_remainder=config.TRAIN_DROP_REMAINDER)
-    ds = ds.shuffle(buffer_size=config.TRAIN_BUFFER_SIZE)
     return ds
 
 def test_dataset_creator():
diff --git a/model_zoo/official/cv/psenet/train.py b/model_zoo/official/cv/psenet/train.py
index 897e21d54e9..513e284950e 100644
--- a/model_zoo/official/cv/psenet/train.py
+++ b/model_zoo/official/cv/psenet/train.py
@@ -54,7 +54,7 @@ def train():
         rank_id = get_rank()
 
     # dataset/network/criterion/optim
-    ds = train_dataset_creator()
+    ds = train_dataset_creator(args.device_id, args.device_num)
     step_size = ds.get_dataset_size()
     print('Create dataset done!')
 
diff --git a/model_zoo/official/nlp/mass/train.py b/model_zoo/official/nlp/mass/train.py
index d0f93c46840..9b6a9e5ae14 100644
--- a/model_zoo/official/nlp/mass/train.py
+++ b/model_zoo/official/nlp/mass/train.py
@@ -25,7 +25,7 @@ from mindspore.nn import Momentum
 from mindspore.nn.optim import Adam, Lamb
 from mindspore.train.model import Model
 from mindspore.train.loss_scale_manager import DynamicLossScaleManager, FixedLossScaleManager
-from mindspore.train.callback import CheckpointConfig, ModelCheckpoint
+from mindspore.train.callback import CheckpointConfig, ModelCheckpoint, TimeMonitor
 from mindspore import context, Parameter
 from mindspore.context import ParallelMode
 from mindspore.communication import management as MultiAscend
@@ -216,11 +216,13 @@ def _build_training_pipeline(config: TransformerConfig,
                              scale_update_cell=scale_manager.get_update_cell())
     net_with_grads.set_train(True)
     model = Model(net_with_grads)
+    time_cb = TimeMonitor(data_size=dataset.get_dataset_size())
     ckpt_config = CheckpointConfig(save_checkpoint_steps=config.save_ckpt_steps,
                                    keep_checkpoint_max=config.keep_ckpt_max)
     rank_size = os.getenv('RANK_SIZE')
     callbacks = []
+    callbacks.append(time_cb)
     if rank_size is not None and int(rank_size) > 1:
         loss_monitor = LossCallBack(config,
                                     rank_id=MultiAscend.get_rank())
         callbacks.append(loss_monitor)
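
For reference, the sharding rule introduced by the new `DistributedSampler` in `model_zoo/official/cv/psenet/src/dataset.py` can be exercised on its own. The sketch below is not the committed code: it takes the dataset length directly instead of a dataset object, uses a hypothetical name, and its non-shuffle branch builds `list(range(dataset_len))`, whereas the patch's `list(range(len(self.dataset_len)))` would raise a `TypeError` because `dataset_len` is an int. The idea is the same: pad the index list to a multiple of `group_size`, then stride by rank so every device gets an equally sized, (up to padding) disjoint shard.

```python
import math
import numpy as np


class ShardSamplerSketch:
    """Minimal sketch: pad indices to a multiple of group_size, then stride by rank."""

    def __init__(self, dataset_len, rank, group_size, shuffle=True, seed=0):
        self.dataset_len = dataset_len
        self.rank = rank
        self.group_size = group_size
        # Every rank sees the same number of samples per epoch.
        self.num_samples = int(math.ceil(dataset_len * 1.0 / group_size))
        self.total_size = self.num_samples * group_size
        self.shuffle = shuffle
        self.seed = seed

    def __iter__(self):
        if self.shuffle:
            # Advance the shared seed each epoch: all ranks generate the same
            # permutation, so their shards stay non-overlapping.
            self.seed = (self.seed + 1) & 0xffffffff
            np.random.seed(self.seed)
            indices = np.random.permutation(self.dataset_len).tolist()
        else:
            indices = list(range(self.dataset_len))
        # Pad with the leading indices, then take every group_size-th index.
        indices += indices[:(self.total_size - len(indices))]
        return iter(indices[self.rank::self.group_size])

    def __len__(self):
        return self.num_samples


if __name__ == "__main__":
    # 10 samples over 4 ranks -> ceil(10/4) = 3 indices per rank; indices 0 and 1
    # reappear as padding on the last two ranks.
    for rank in range(4):
        print(rank, list(ShardSamplerSketch(10, rank, 4, shuffle=False)))
```

In the committed psenet `train.py`, the creator is wired up as `train_dataset_creator(args.device_id, args.device_num)`, so each device builds its `GeneratorDataset` over only its own shard; the previous `ds.shuffle(buffer_size=config.TRAIN_BUFFER_SIZE)` call is dropped because shuffling now happens inside the sampler.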