From 95ccebb81111dc82ad0e749f2f62fa925f0b87b1 Mon Sep 17 00:00:00 2001
From: shibeiji
Date: Wed, 3 Feb 2021 10:47:28 +0800
Subject: [PATCH] fix bugs of MSRA dataset preprocessing

---
 model_zoo/official/nlp/bert/README.md         | 11 +++++-----
 model_zoo/official/nlp/bert/README_CN.md      | 11 +++++-----
 model_zoo/official/nlp/bert/export.py         |  3 +--
 model_zoo/official/nlp/bert/run_ner.py        |  3 +--
 .../official/nlp/bert/scripts/run_ner.sh      |  1 -
 .../nlp/bert/src/finetune_data_preprocess.py  | 21 ++++++++++---------
 6 files changed, 23 insertions(+), 27 deletions(-)

diff --git a/model_zoo/official/nlp/bert/README.md b/model_zoo/official/nlp/bert/README.md
index 910a7e6d024..af1e717136b 100644
--- a/model_zoo/official/nlp/bert/README.md
+++ b/model_zoo/official/nlp/bert/README.md
@@ -292,11 +292,10 @@ options:
     --use_crf                          whether to use crf to calculate loss: true | false
     --device_id                        device id to run task
     --epoch_num                        total number of training epochs to perform
-    --num_class                        number of classes to do labeling
     --train_data_shuffle               Enable train data shuffle, default is true
     --eval_data_shuffle                Enable eval data shuffle, default is true
     --vocab_file_path                  the vocabulary file that the BERT model was trained on
-    --label2id_file_path               label to id json file
+    --label2id_file_path               label to id file, each label name must be consistent with the type names annotated in the original dataset file
     --save_finetune_checkpoint_path    path to save generated finetuning checkpoint
     --load_pretrain_checkpoint_path    initial checkpoint (usually from a pre-trained BERT model)
     --load_finetune_checkpoint_path    give a finetuning checkpoint path if only do eval
@@ -533,7 +532,7 @@ acc_num XXX, total_num XXX, accuracy 0.588986
 #### evaluation on cluener dataset when running on Ascend
 
 ```bash
-bash scripts/ner.sh
+bash scripts/run_ner.sh
 ```
 
 The command above will run in the background, you can view training logs in ner_log.txt.
@@ -548,16 +547,16 @@ F1 0.920507
 #### evaluation on msra dataset when running on Ascend
 
-For preprocess, you can first convert the original txt format of MSRA dataset into mindrecord by run the command as below:
+For preprocessing, you can first convert the MSRA dataset from its original XML format into MindRecord by running the command below (please keep in mind that the label names in label2id_file should be consistent with the type names annotated in the original msra_dataset.xml dataset file):
 
 ```python
-python src/finetune_data_preprocess.py ----data_dir=/path/msra_dataset.txt --vocab_file=/path/vacab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
+python src/finetune_data_preprocess.py --data_dir=/path/msra_dataset.xml --vocab_file=/path/vocab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
 ```
 
 For finetune and evaluation, just do
 
 ```bash
-bash scripts/ner.sh
+bash scripts/run_ner.sh
 ```
 
 The command above will run in the background, you can view training logs in ner_log.txt.
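A note on the label2id requirement introduced above: the preprocessing script reads the file one label name per line (see the `for tag in f: labels_list.append(tag.strip())` loop later in this patch), and each name must match a type name actually annotated in msra_dataset.xml. A minimal sketch of writing such a file, using hypothetical placeholder names rather than the real MSRA type names:

```python
# Hypothetical label names for illustration only -- replace them with the
# type names actually annotated in your copy of msra_dataset.xml
# (the README command above filters on the "NAMEX" class).
labels = ["PERSON", "LOCATION", "ORGANIZATION"]

# label2id_file is plain text, one label name per line, as read by
# finetune_data_preprocess.py.
with open("label2id_file", "w", encoding="utf-8") as f:
    f.write("\n".join(labels) + "\n")
```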
diff --git a/model_zoo/official/nlp/bert/README_CN.md b/model_zoo/official/nlp/bert/README_CN.md
index db407721e7c..4b9ca2f79c5 100644
--- a/model_zoo/official/nlp/bert/README_CN.md
+++ b/model_zoo/official/nlp/bert/README_CN.md
@@ -290,11 +290,10 @@ For example, the schema file of cn-wiki-128 dataset for pretraining shows as follows:
     --use_crf                          是否采用CRF来计算损失,可选项为true或false
     --device_id                        任务运行的设备ID
     --epoch_num                        训练轮次总数
-    --num_class                        标注类的数量
     --train_data_shuffle               是否使能训练数据集轮换,默认为true
     --eval_data_shuffle                是否使能评估数据集轮换,默认为true
     --vocab_file_path                  BERT模型训练的词汇表
-    --label2id_file_path               标注转ID的JSON文件
+    --label2id_file_path               标注文件,文件中的标注名称必须与原始数据集中所标注的类型名称完全一致
     --save_finetune_checkpoint_path    保存生成微调检查点的路径
     --load_pretrain_checkpoint_path    初始检查点(通常来自预训练BERT模型)
     --load_finetune_checkpoint_path    如仅执行评估,提供微调检查点保存路径
@@ -497,7 +496,7 @@ acc_num XXX, total_num XXX, accuracy 0.588986
 #### Ascend处理器上运行后评估cluener数据集
 
 ```bash
-bash scripts/ner.sh
+bash scripts/run_ner.sh
 ```
 
 以上命令后台运行,您可以在ner_log.txt中查看训练日志。
@@ -512,16 +511,16 @@ F1 0.920507
 #### Ascend处理器上运行后评估msra数据集
 
-您可以采用如下方式,先将MSRA数据集的原始格式在预处理流程中转换为mindrecord格式以提升性能:
+您可以采用如下方式,先将MSRA数据集的原始格式在预处理流程中转换为mindrecord格式以提升性能(请注意label2id_file文件中的标注名称应与数据集msra_dataset.xml文件中的标注名保持一致):
 
 ```python
-python src/finetune_data_preprocess.py ----data_dir=/path/msra_dataset.txt --vocab_file=/path/vacab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
+python src/finetune_data_preprocess.py --data_dir=/path/msra_dataset.xml --vocab_file=/path/vocab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
 ```
 
 此后,您可以进行微调再训练和推理流程,
 
 ```bash
-bash scripts/ner.sh
+bash scripts/run_ner.sh
 ```
 
 以上命令后台运行,您可以在ner_log.txt中查看训练日志。
diff --git a/model_zoo/official/nlp/bert/export.py b/model_zoo/official/nlp/bert/export.py
index ec2ba210b67..37c48a8eb89 100644
--- a/model_zoo/official/nlp/bert/export.py
+++ b/model_zoo/official/nlp/bert/export.py
@@ -29,7 +29,6 @@ parser.add_argument("--device_id", type=int, default=0, help="Device id")
 parser.add_argument("--use_crf", type=str, default="false", help="Use cfg, default is false.")
 parser.add_argument("--downstream_task", type=str, choices=["NER", "CLS", "SQUAD"], default="NER",
                     help="at present,support NER only")
-parser.add_argument("--num_class", type=int, default=41, help="The number of class, default is 41.")
 parser.add_argument("--batch_size", type=int, default=16, help="batch size")
 parser.add_argument("--label_file_path", type=str, default="", help="label file path, used in clue benchmark.")
 parser.add_argument("--ckpt_file", type=str, required=True, help="Bert ckpt file.")
@@ -56,7 +55,7 @@ if args.use_crf.lower() == "true":
     tag_to_index["<STOP>"] = max_val + 2
     number_labels = len(tag_to_index)
 else:
-    number_labels = args.num_class
+    number_labels = len(tag_to_index)
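The export.py change above makes number_labels depend on the label file in both branches instead of a hard-coded --num_class default. A minimal sketch of the resulting logic, assuming a simple label-to-index mapping (the repo's convert_labels_to_index helper may use a richer tagging scheme; labels_to_index below is a simplified stand-in):

```python
def labels_to_index(label_list):
    # Simplified stand-in for the repo's convert_labels_to_index helper:
    # map each label name to an integer index in file order.
    return {label: index for index, label in enumerate(label_list)}

label_list = ["O", "PERSON", "LOCATION"]  # hypothetical contents of label_file_path
tag_to_index = labels_to_index(label_list)

use_crf = True
if use_crf:
    # CRF decoding needs the two extra transition tags seen in the diff.
    max_val = max(tag_to_index.values())
    tag_to_index["<START>"] = max_val + 1
    tag_to_index["<STOP>"] = max_val + 2

# With or without CRF, the class count now follows the label file,
# so it can no longer drift out of sync with a --num_class flag.
number_labels = len(tag_to_index)
print(number_labels)  # 5 with CRF, 3 without
```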
diff --git a/model_zoo/official/nlp/bert/run_ner.py b/model_zoo/official/nlp/bert/run_ner.py
index c0324a87dc5..e527b52e0f3 100644
--- a/model_zoo/official/nlp/bert/run_ner.py
+++ b/model_zoo/official/nlp/bert/run_ner.py
@@ -161,7 +161,6 @@ def parse_args():
                         help="Use crf, default is false")
     parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
     parser.add_argument("--epoch_num", type=int, default=5, help="Epoch number, default is 5.")
-    parser.add_argument("--num_class", type=int, default=41, help="The number of class, default is 41.")
     parser.add_argument("--train_data_shuffle", type=str, default="true", choices=["true", "false"],
                         help="Enable train data shuffle, default is true")
     parser.add_argument("--eval_data_shuffle", type=str, default="false", choices=["true", "false"],
@@ -228,7 +227,7 @@ def run_ner():
         tag_to_index["<STOP>"] = max_val + 2
         number_labels = len(tag_to_index)
     else:
-        number_labels = args_opt.num_class
+        number_labels = len(tag_to_index)
     if args_opt.do_train.lower() == "true":
         netwithloss = BertNER(bert_net_cfg, args_opt.train_batch_size, True, num_labels=number_labels,
                               use_crf=(args_opt.use_crf.lower() == "true"),
diff --git a/model_zoo/official/nlp/bert/scripts/run_ner.sh b/model_zoo/official/nlp/bert/scripts/run_ner.sh
index d108be3f86a..13b9ce82e92 100644
--- a/model_zoo/official/nlp/bert/scripts/run_ner.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_ner.sh
@@ -34,7 +34,6 @@ python ${PROJECT_DIR}/../run_ner.py \
     --use_crf="false" \
     --device_id=0 \
     --epoch_num=5 \
-    --num_class=41 \
     --train_data_shuffle="true" \
     --eval_data_shuffle="false" \
     --train_batch_size=32 \
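For context on the preprocessing diff that follows: process_msra is a plain Python generator yielding (text, label) pairs, which MindSpore can wrap directly via GeneratorDataset. A minimal sketch of that pattern with a toy generator and made-up tags (not the repo's actual XML parsing):

```python
import numpy as np
import mindspore.dataset as ds

def toy_msra_generator():
    # Each yielded pair mirrors the ('text', 'label') columns that
    # process_msra produces; the sentence and tags here are made up.
    samples = [("你好北京", ["O", "O", "B-LOCATION", "I-LOCATION"])]
    for content, tags in samples:
        yield (np.array(content), np.array(tags))

dataset = ds.GeneratorDataset(toy_msra_generator, column_names=["text", "label"])
for row in dataset.create_dict_iterator(output_numpy=True):
    print(row["text"], row["label"])
```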
diff --git a/model_zoo/official/nlp/bert/src/finetune_data_preprocess.py b/model_zoo/official/nlp/bert/src/finetune_data_preprocess.py
index 84269982e0b..83480c819c4 100755
--- a/model_zoo/official/nlp/bert/src/finetune_data_preprocess.py
+++ b/model_zoo/official/nlp/bert/src/finetune_data_preprocess.py
@@ -140,7 +140,7 @@ def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage
     return dataset
 
 
-def process_cluener_msra(data_file, class_filter=None, split_begin=None, split_end=None):
+def process_msra(data_file, class_filter=None, split_begin=None, split_end=None):
     """
     Data pre-process for MSRA dataset
     Args:
@@ -188,11 +188,11 @@ def process_cluener_msra(data_file, class_filter=None, split_begin=None, split_end=None):
         yield (np.array("".join(content)), np.array(list(tags)))
 
 
-def process_msra_clue_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
-                              split_begin=None, split_end=None):
+def process_ner_msra_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
+                             split_begin=None, split_end=None):
     """Process MSRA dataset"""
     ### Loading MSRA from CLUEDataset
-    dataset = ds.GeneratorDataset(process_cluener_msra(data_dir, class_filter, split_begin, split_end),
+    dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter, split_begin, split_end),
                                   column_names=['text', 'label'])
 
     ### Processing label
@@ -230,13 +230,14 @@ if __name__ == "__main__":
     parser.add_argument("--vocab_file", type=str, default="", help="Vocab file path")
     parser.add_argument("--save_path", type=str, default="./my.mindrecord", help="Path to save mindrecord")
     parser.add_argument("--label2id", type=str, default="",
-                        help="Label2id file path, must be set for cluener2020 task")
+                        help="Label2id file path, please keep in mind that each label name should be consistent with "
+                             "the type name labeled in the original dataset file")
     parser.add_argument("--max_seq_len", type=int, default=128, help="Sequence length")
     parser.add_argument("--class_filter", nargs='*', help="Specified classes will be counted, if empty all in counted")
-    parser.add_argument("--split_begin", type=float, default=None, help="Specified subsets of date will be counted,"
+    parser.add_argument("--split_begin", type=float, default=None, help="Specified subsets of data will be counted,"
                                                                         "if not None, the data will counted begin from split_begin")
-    parser.add_argument("--split_end", type=float, default=None, help="Specified subsets of date will be counted,"
-                                                                      "if not None, the data will counted before split_before")
+    parser.add_argument("--split_end", type=float, default=None, help="Specified subsets of data will be counted, "
+                                                                      "if not None, the data before split_end will be counted")
 
     args_opt = parser.parse_args()
     if args_opt.label2id == "":
@@ -246,6 +247,6 @@ if __name__ == "__main__":
         for tag in f:
             labels_list.append(tag.strip())
     tag_to_index = list(convert_labels_to_index(labels_list).keys())
-    ds = process_msra_clue_dataset(args_opt.data_dir, tag_to_index, args_opt.vocab_file, args_opt.max_seq_len,
-                                   args_opt.class_filter, args_opt.split_begin, args_opt.split_end)
+    ds = process_ner_msra_dataset(args_opt.data_dir, tag_to_index, args_opt.vocab_file, args_opt.max_seq_len,
+                                  args_opt.class_filter, args_opt.split_begin, args_opt.split_end)
     ds.save(args_opt.save_path)
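Finally, on the corrected split_begin/split_end help strings above: the two flags select a fractional [split_begin, split_end) slice of the samples, so 0.0 and 1.0 (as in the README command) keep everything. A minimal sketch of that semantics, assumed from the help text; the actual slicing lives inside process_msra and take_split below is only an illustration:

```python
def take_split(samples, split_begin=None, split_end=None):
    # Keep the fraction of samples in [split_begin, split_end),
    # interpreting both flags as proportions of the whole dataset.
    total = len(samples)
    begin = int(total * split_begin) if split_begin is not None else 0
    end = int(total * split_end) if split_end is not None else total
    return samples[begin:end]

# --split_begin=0.0 --split_end=1.0 keeps all data;
# 0.0/0.9 would keep the first 90%, e.g. as a training split.
print(take_split(list(range(10)), 0.0, 0.9))  # [0, 1, 2, 3, 4, 5, 6, 7, 8]
```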