fix bugs in MSRA dataset preprocessing

shibeiji 2021-02-03 10:47:28 +08:00
parent b2fb825d89
commit 95ccebb811
6 changed files with 23 additions and 27 deletions

View File

@ -292,11 +292,10 @@ options:
--use_crf whether to use crf to calculate loss: true | false
--device_id device id to run task
--epoch_num total number of training epochs to perform
--num_class number of classes to do labeling
--train_data_shuffle Enable train data shuffle, default is true
--eval_data_shuffle Enable eval data shuffle, default is true
--vocab_file_path the vocabulary file that the BERT model was trained on
--label2id_file_path label to id json file
--label2id_file_path label to id file; each label name must be consistent with the type names labeled in the original dataset file
--save_finetune_checkpoint_path path to save generated finetuning checkpoint
--load_pretrain_checkpoint_path initial checkpoint (usually from a pre-trained BERT model)
--load_finetune_checkpoint_path give a finetuning checkpoint path if only doing eval
@ -533,7 +532,7 @@ acc_num XXX, total_num XXX, accuracy 0.588986
#### evaluation on cluener dataset when running on Ascend
```bash
bash scripts/ner.sh
bash scripts/run_ner.sh
```
The command above will run in the background; you can view training logs in ner_log.txt.
@ -548,16 +547,16 @@ F1 0.920507
#### evaluation on msra dataset when running on Ascend
For preprocessing, you can first convert the original txt format of the MSRA dataset into mindrecord by running the command below:
For preprocessing, you can first convert the original format of the MSRA dataset into mindrecord by running the command below (please keep in mind that the label names in label2id_file should be consistent with the type names labeled in the original msra_dataset.xml dataset file):
```python
python src/finetune_data_preprocess.py ----data_dir=/path/msra_dataset.txt --vocab_file=/path/vacab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
python src/finetune_data_preprocess.py --data_dir=/path/msra_dataset.xml --vocab_file=/path/vocab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
```
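The label2id file is expected to contain one label name per line; the preprocessing script reads it with a simple loop and then hands the list to convert_labels_to_index (see the finetune_data_preprocess.py hunk later in this commit). A minimal sketch, with hypothetical placeholder label names (use the exact type names that appear in msra_dataset.xml):
```python
# Hypothetical label2id file contents (one label name per line):
#   O
#   PERSON
#   LOCATION
#   ORGANIZATION
# Simplified sketch of how the file is read; the real script then passes
# labels_list to convert_labels_to_index() to build tag_to_index.
labels_list = []
with open("/path/label2id_file") as f:
    for tag in f:
        labels_list.append(tag.strip())
print(labels_list)  # e.g. ['O', 'PERSON', 'LOCATION', 'ORGANIZATION']
```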
For finetuning and evaluation, just run
```bash
bash scripts/ner.sh
bash scripts/run_ner.sh
```
The command above will run in the background; you can view training logs in ner_log.txt.
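If separate train and eval mindrecord files are needed, the split_begin and split_end fractions of finetune_data_preprocess.py can be used to preprocess contiguous subsets of the data. A hedged sketch (an assumed 80/20 split and output paths, not part of this commit):
```bash
# Assumed usage: first 80% of the data for training, remaining 20% for evaluation.
python src/finetune_data_preprocess.py --data_dir=/path/msra_dataset.xml --vocab_file=/path/vocab_file --save_path=/path/msra_train.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=0.8
python src/finetune_data_preprocess.py --data_dir=/path/msra_dataset.xml --vocab_file=/path/vocab_file --save_path=/path/msra_eval.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.8 --split_end=1.0
```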

View File

@ -290,11 +290,10 @@ For example, the schema file of cn-wiki-128 dataset for pretraining shows as fol
--use_crf whether to use crf to calculate loss: true | false
--device_id device id to run task
--epoch_num total number of training epochs to perform
--num_class number of classes to do labeling
--train_data_shuffle Enable train data shuffle, default is true
--eval_data_shuffle Enable eval data shuffle, default is true
--vocab_file_path the vocabulary file that the BERT model was trained on
--label2id_file_path label to id JSON file
--label2id_file_path label to id file; each label name must be consistent with the type names labeled in the original dataset file
--save_finetune_checkpoint_path path to save generated finetuning checkpoint
--load_pretrain_checkpoint_path initial checkpoint (usually from a pre-trained BERT model)
--load_finetune_checkpoint_path give a finetuning checkpoint path if only doing eval
@ -497,7 +496,7 @@ acc_num XXX, total_num XXX, accuracy 0.588986
#### evaluation on cluener dataset when running on Ascend
```bash
bash scripts/ner.sh
bash scripts/run_ner.sh
```
The command above will run in the background; you can view training logs in ner_log.txt.
@ -512,16 +511,16 @@ F1 0.920507
#### evaluation on msra dataset when running on Ascend
For preprocessing, you can first convert the original format of the MSRA dataset into mindrecord to improve performance:
For preprocessing, you can first convert the original format of the MSRA dataset into mindrecord to improve performance (please note that the label names in the label2id_file should be consistent with the type names labeled in the msra_dataset.xml dataset file):
```python
python src/finetune_data_preprocess.py ----data_dir=/path/msra_dataset.txt --vocab_file=/path/vacab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
python src/finetune_data_preprocess.py --data_dir=/path/msra_dataset.xml --vocab_file=/path/vocab_file --save_path=/path/msra_dataset.mindrecord --label2id=/path/label2id_file --max_seq_len=seq_len --class_filter="NAMEX" --split_begin=0.0 --split_end=1.0
```
After that, you can run the finetuning and evaluation flow:
```bash
bash scripts/ner.sh
bash scripts/run_ner.sh
```
The command above will run in the background; you can view training logs in ner_log.txt.

View File

@ -29,7 +29,6 @@ parser.add_argument("--device_id", type=int, default=0, help="Device id")
parser.add_argument("--use_crf", type=str, default="false", help="Use cfg, default is false.")
parser.add_argument("--downstream_task", type=str, choices=["NER", "CLS", "SQUAD"], default="NER",
help="at presentsupport NER only")
parser.add_argument("--num_class", type=int, default=41, help="The number of class, default is 41.")
parser.add_argument("--batch_size", type=int, default=16, help="batch size")
parser.add_argument("--label_file_path", type=str, default="", help="label file path, used in clue benchmark.")
parser.add_argument("--ckpt_file", type=str, required=True, help="Bert ckpt file.")
@ -56,7 +55,7 @@ if args.use_crf.lower() == "true":
tag_to_index["<STOP>"] = max_val + 2
number_labels = len(tag_to_index)
else:
number_labels = args.num_class
number_labels = len(tag_to_index)
if __name__ == "__main__":
if args.downstream_task == "NER":

View File

@ -161,7 +161,6 @@ def parse_args():
help="Use crf, default is false")
parser.add_argument("--device_id", type=int, default=0, help="Device id, default is 0.")
parser.add_argument("--epoch_num", type=int, default=5, help="Epoch number, default is 5.")
parser.add_argument("--num_class", type=int, default=41, help="The number of class, default is 41.")
parser.add_argument("--train_data_shuffle", type=str, default="true", choices=["true", "false"],
help="Enable train data shuffle, default is true")
parser.add_argument("--eval_data_shuffle", type=str, default="false", choices=["true", "false"],
@ -228,7 +227,7 @@ def run_ner():
tag_to_index["<STOP>"] = max_val + 2
number_labels = len(tag_to_index)
else:
number_labels = args_opt.num_class
number_labels = len(tag_to_index)
if args_opt.do_train.lower() == "true":
netwithloss = BertNER(bert_net_cfg, args_opt.train_batch_size, True, num_labels=number_labels,
use_crf=(args_opt.use_crf.lower() == "true"),

View File

@ -34,7 +34,6 @@ python ${PROJECT_DIR}/../run_ner.py \
--use_crf="false" \
--device_id=0 \
--epoch_num=5 \
--num_class=41 \
--train_data_shuffle="true" \
--eval_data_shuffle="false" \
--train_batch_size=32 \

View File

@ -140,7 +140,7 @@ def process_cmnli_clue_dataset(data_dir, label_list, bert_vocab_path, data_usage
return dataset
def process_cluener_msra(data_file, class_filter=None, split_begin=None, split_end=None):
def process_msra(data_file, class_filter=None, split_begin=None, split_end=None):
"""
Data pre-process for MSRA dataset
Args:
@ -188,11 +188,11 @@ def process_cluener_msra(data_file, class_filter=None, split_begin=None, split_e
yield (np.array("".join(content)), np.array(list(tags)))
def process_msra_clue_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
split_begin=None, split_end=None):
def process_ner_msra_dataset(data_dir, label_list, bert_vocab_path, max_seq_len=128, class_filter=None,
split_begin=None, split_end=None):
"""Process MSRA dataset"""
### Loading MSRA from CLUEDataset
dataset = ds.GeneratorDataset(process_cluener_msra(data_dir, class_filter, split_begin, split_end),
dataset = ds.GeneratorDataset(process_msra(data_dir, class_filter, split_begin, split_end),
column_names=['text', 'label'])
### Processing label
@ -230,13 +230,14 @@ if __name__ == "__main__":
parser.add_argument("--vocab_file", type=str, default="", help="Vocab file path")
parser.add_argument("--save_path", type=str, default="./my.mindrecord", help="Path to save mindrecord")
parser.add_argument("--label2id", type=str, default="",
help="Label2id file path, must be set for cluener2020 task")
help="Label2id file path, please keep in mind that each label name should be consistent with"
"the type name labeled in the oroginal dataset file")
parser.add_argument("--max_seq_len", type=int, default=128, help="Sequence length")
parser.add_argument("--class_filter", nargs='*', help="Specified classes will be counted, if empty all in counted")
parser.add_argument("--split_begin", type=float, default=None, help="Specified subsets of date will be counted,"
parser.add_argument("--split_begin", type=float, default=None, help="Specified subsets of data will be counted,"
"if not None, the data will counted begin from split_begin")
parser.add_argument("--split_end", type=float, default=None, help="Specified subsets of date will be counted,"
"if not None, the data will counted before split_before")
parser.add_argument("--split_end", type=float, default=None, help="Specified subsets of data will be counted,"
"if not None, the data before split_end will be counted ")
args_opt = parser.parse_args()
if args_opt.label2id == "":
@ -246,6 +247,6 @@ if __name__ == "__main__":
for tag in f:
labels_list.append(tag.strip())
tag_to_index = list(convert_labels_to_index(labels_list).keys())
ds = process_msra_clue_dataset(args_opt.data_dir, tag_to_index, args_opt.vocab_file, args_opt.max_seq_len,
args_opt.class_filter, args_opt.split_begin, args_opt.split_end)
ds = process_ner_msra_dataset(args_opt.data_dir, tag_to_index, args_opt.vocab_file, args_opt.max_seq_len,
args_opt.class_filter, args_opt.split_begin, args_opt.split_end)
ds.save(args_opt.save_path)
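To sanity-check the generated mindrecord, here is a minimal sketch (not part of this commit; it only assumes the standard mindspore.dataset.MindDataset API) that loads the saved file back and prints the column shapes of the first sample:
```python
import mindspore.dataset as ds

# Load the mindrecord produced by finetune_data_preprocess.py and inspect one sample.
data = ds.MindDataset("/path/msra_dataset.mindrecord")
for item in data.create_dict_iterator(num_epochs=1, output_numpy=True):
    print({name: value.shape for name, value in item.items()})  # column name -> shape
    break
```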