!6616 add tokenization and score file

Merge pull request !6616 from yoonlee666/token
2020-09-22 10:37:31 +08:00 · 2020-09-22 10:37:31 +08:00 · b309850036
parent b637fb4554 01c9e8b373
commit b309850036
7 changed files with 457 additions and 28 deletions
--- a/model_zoo/official/nlp/bert/run_ner.py
+++ b/model_zoo/official/nlp/bert/run_ner.py
@ -18,12 +18,11 @@ Bert finetune and evaluation script.
 '''

 import os
-import json
 import argparse
 from src.bert_for_finetune import BertFinetuneCell, BertNER
 from src.finetune_eval_config import optimizer_cfg, bert_net_cfg
 from src.dataset import create_ner_dataset
-from src.utils import make_directory, LossCallBack, LoadNewestCkpt, BertLearningRate
+from src.utils import make_directory, LossCallBack, LoadNewestCkpt, BertLearningRate, convert_labels_to_index
 from src.assessment_method import Accuracy, F1, MCC, Spearman_Correlation
 import mindspore.common.dtype as mstype
 from mindspore import context
@ -99,7 +98,7 @@ def eval_result_print(assessment_method="accuracy", callback=None):
        raise ValueError("Assessment method not supported, support: [accuracy, f1, mcc, spearman_correlation]")

 def do_eval(dataset=None, network=None, use_crf="", num_class=2, assessment_method="accuracy", data_file="",
-            load_checkpoint_path="", vocab_file="", label2id_file="", tag_to_index=None):
+            load_checkpoint_path="", vocab_file="", label_file="", tag_to_index=None):
    """ do eval """
    if load_checkpoint_path == "":
        raise ValueError("Finetune model missed, evaluation task must load finetune model!")
@ -114,7 +113,8 @@ def do_eval(dataset=None, network=None, use_crf="", num_class=2, assessment_meth

    if assessment_method == "clue_benchmark":
        from src.cluener_evaluation import submit
-        submit(model=model, path=data_file, vocab_file=vocab_file, use_crf=use_crf, label2id_file=label2id_file)
+        submit(model=model, path=data_file, vocab_file=vocab_file, use_crf=use_crf,
+               label_file=label_file, tag_to_index=tag_to_index)
    else:
        if assessment_method == "accuracy":
            callback = Accuracy()
@ -161,7 +161,7 @@ def parse_args():
    parser.add_argument("--eval_data_shuffle", type=str, default="false", choices=["true", "false"],
                        help="Enable eval data shuffle, default is false")
    parser.add_argument("--vocab_file_path", type=str, default="", help="Vocab file path, used in clue benchmark")
-    parser.add_argument("--label2id_file_path", type=str, default="", help="label2id file path, used in clue benchmark")
+    parser.add_argument("--label_file_path", type=str, default="", help="label file path, used in clue benchmark")
    parser.add_argument("--save_finetune_checkpoint_path", type=str, default="", help="Save checkpoint path")
    parser.add_argument("--load_pretrain_checkpoint_path", type=str, default="", help="Load checkpoint file path")
    parser.add_argument("--load_finetune_checkpoint_path", type=str, default="", help="Load checkpoint file path")
@ -180,10 +180,10 @@ def parse_args():
        raise ValueError("'eval_data_file_path' must be set when do evaluation task")
    if args_opt.assessment_method.lower() == "clue_benchmark" and args_opt.vocab_file_path == "":
        raise ValueError("'vocab_file_path' must be set to do clue benchmark")
-    if args_opt.use_crf.lower() == "true" and args_opt.label2id_file_path == "":
-        raise ValueError("'label2id_file_path' must be set to use crf")
-    if args_opt.assessment_method.lower() == "clue_benchmark" and args_opt.label2id_file_path == "":
-        raise ValueError("'label2id_file_path' must be set to do clue benchmark")
+    if args_opt.use_crf.lower() == "true" and args_opt.label_file_path == "":
+        raise ValueError("'label_file_path' must be set to use crf")
+    if args_opt.assessment_method.lower() == "clue_benchmark" and args_opt.label_file_path == "":
+        raise ValueError("'label_file_path' must be set to do clue benchmark")
    return args_opt


@ -205,11 +205,12 @@ def run_ner():
            bert_net_cfg.compute_type = mstype.float32
    else:
        raise Exception("Target error, GPU or Ascend is supported.")
-
-    tag_to_index = None
+    label_list = []
+    with open(args_opt.label_file_path) as f:
+        for label in f:
+            label_list.append(label.strip())
+    tag_to_index = convert_labels_to_index(label_list)
    if args_opt.use_crf.lower() == "true":
-        with open(args_opt.label2id_file_path) as json_file:
-            tag_to_index = json.load(json_file)
        max_val = max(tag_to_index.values())
        tag_to_index["<START>"] = max_val + 1
        tag_to_index["<STOP>"] = max_val + 2
@ -240,7 +241,7 @@ def run_ner():
                                schema_file_path=args_opt.schema_file_path,
                                do_shuffle=(args_opt.eval_data_shuffle.lower() == "true"))
        do_eval(ds, BertNER, args_opt.use_crf, number_labels, assessment_method, args_opt.eval_data_file_path,
-                load_finetune_checkpoint_path, args_opt.vocab_file_path, args_opt.label2id_file_path, tag_to_index)
+                load_finetune_checkpoint_path, args_opt.vocab_file_path, args_opt.label_file_path, tag_to_index)

 if __name__ == "__main__":
    run_ner()
--- a/model_zoo/official/nlp/bert/scripts/run_ner.sh
+++ b/model_zoo/official/nlp/bert/scripts/run_ner.sh
@ -38,7 +38,7 @@ python ${PROJECT_DIR}/../run_ner.py  \
    --train_data_shuffle="true" \
    --eval_data_shuffle="false" \
    --vocab_file_path="" \
-    --label2id_file_path="" \
+    --label_file_path="" \
    --save_finetune_checkpoint_path="" \
    --load_pretrain_checkpoint_path="" \
    --load_finetune_checkpoint_path="" \
--- a/model_zoo/official/nlp/bert/src/cluener_evaluation.py
+++ b/model_zoo/official/nlp/bert/src/cluener_evaluation.py
@ -23,9 +23,9 @@ from src import tokenization
 from src.sample_process import label_generation, process_one_example_p
 from src.CRF import postprocess
 from src.finetune_eval_config import bert_net_cfg
+from src.score import get_result

-
-def process(model=None, text="", tokenizer_=None, use_crf="", label2id_file=""):
+def process(model=None, text="", tokenizer_=None, use_crf="", tag_to_index=None, vocab=""):
    """
    process text.
    """
@ -34,7 +34,7 @@ def process(model=None, text="", tokenizer_=None, use_crf="", label2id_file=""):
    res = []
    ids = []
    for i in data:
-        feature = process_one_example_p(tokenizer_, i, max_seq_len=bert_net_cfg.seq_length)
+        feature = process_one_example_p(tokenizer_, vocab, i, max_seq_len=bert_net_cfg.seq_length)
        features.append(feature)
        input_ids, input_mask, token_type_id = feature
        input_ids = Tensor(np.array(input_ids), mstype.int32)
@ -52,10 +52,10 @@ def process(model=None, text="", tokenizer_=None, use_crf="", label2id_file=""):
            ids = logits.asnumpy()
            ids = np.argmax(ids, axis=-1)
            ids = list(ids)
-    res = label_generation(text=text, probs=ids, label2id_file=label2id_file)
+    res = label_generation(text=text, probs=ids, tag_to_index=tag_to_index)
    return res

-def submit(model=None, path="", vocab_file="", use_crf="", label2id_file=""):
+def submit(model=None, path="", vocab_file="", use_crf="", label_file="", tag_to_index=None):
    """
    submit task
    """
@ -66,8 +66,11 @@ def submit(model=None, path="", vocab_file="", use_crf="", label2id_file=""):
            continue
        oneline = json.loads(line.strip())
        res = process(model=model, text=oneline["text"], tokenizer_=tokenizer_,
-                      use_crf=use_crf, label2id_file=label2id_file)
-        print("text", oneline["text"])
-        print("res:", res)
+                      use_crf=use_crf, tag_to_index=tag_to_index, vocab=vocab_file)
        data.append(json.dumps({"label": res}, ensure_ascii=False))
    open("ner_predict.json", "w").write("\n".join(data))
+    labels = []
+    with open(label_file) as f:
+        for label in f:
+            labels.append(label.strip())
+    get_result(labels, "ner_predict.json", path)
--- a/model_zoo/official/nlp/bert/src/sample_process.py
+++ b/model_zoo/official/nlp/bert/src/sample_process.py
@ -16,9 +16,9 @@
 """process txt"""

 import re
-import json
+from src.tokenization import convert_tokens_to_ids

-def process_one_example_p(tokenizer, text, max_seq_len=128):
+def process_one_example_p(tokenizer, vocab, text, max_seq_len=128):
    """process one testline"""
    textlist = list(text)
    tokens = []
@ -37,7 +37,7 @@ def process_one_example_p(tokenizer, text, max_seq_len=128):
        segment_ids.append(0)
    ntokens.append("[SEP]")
    segment_ids.append(0)
-    input_ids = tokenizer.convert_tokens_to_ids(ntokens)
+    input_ids = convert_tokens_to_ids(vocab, ntokens)
    input_mask = [1] * len(input_ids)
    while len(input_ids) < max_seq_len:
        input_ids.append(0)
@ -52,12 +52,12 @@ def process_one_example_p(tokenizer, text, max_seq_len=128):
    feature = (input_ids, input_mask, segment_ids)
    return feature

-def label_generation(text="", probs=None, label2id_file=""):
+def label_generation(text="", probs=None, tag_to_index=None):
    """generate label"""
    data = [text]
    probs = [probs]
    result = []
-    label2id = json.loads(open(label2id_file).read())
+    label2id = tag_to_index
    id2label = [k for k, v in label2id.items()]

    for index, prob in enumerate(probs):
--- a/model_zoo/official/nlp/bert/src/score.py
+++ b/model_zoo/official/nlp/bert/src/score.py
@ -0,0 +1,79 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Calculate average F1 score among labels.
+"""
+
+import json
+
+def get_f1_score_for_each_label(pre_lines, gold_lines, label):
+    """
+    Get F1 score for each label.
+    Args:
+        pre_lines: listed label info from pre_file.
+        gold_lines: listed label info from gold_file.
+        label: name of the label to score; entries lacking it count as empty.
+
+    Returns:
+        F1 score for this label, or 0.0 when the label occurs in neither
+        the predictions nor the ground truth.
+    """
+    TP = 0
+    FP = 0
+    FN = 0
+    for index, pre_entry in enumerate(pre_lines):
+        pre_line = pre_entry.get(label, {})
+        gold_line = gold_lines[index].get(label, {})
+        # Samples are matched by membership only.
+        for sample in pre_line:
+            if sample in gold_line:
+                TP += 1
+            else:
+                FP += 1
+        for sample in gold_line:
+            if sample not in pre_line:
+                FN += 1
+    denominator = 2 * TP + FP + FN
+    if denominator == 0:
+        # Nothing predicted and nothing expected: avoid ZeroDivisionError.
+        return 0.0
+    return 2 * TP / denominator
+
+
+def get_f1_score(labels, pre_file, gold_file):
+    """
+    Get F1 scores for each label.
+    Args:
+        labels: list of labels.
+        pre_file: prediction file.
+        gold_file: ground truth file.
+
+    Returns:
+        average F1 score on all labels.
+
+    Raises:
+        ValueError: if labels is empty, or if the two files have a different
+            number of non-blank lines.
+    """
+    if not labels:
+        raise ValueError("labels must not be empty.")
+    # Context managers close both files even when a line fails to parse.
+    with open(pre_file) as pre_reader:
+        pre_lines = [json.loads(line.strip())['label'] for line in pre_reader if line.strip()]
+    with open(gold_file) as gold_reader:
+        gold_lines = [json.loads(line.strip())['label'] for line in gold_reader if line.strip()]
+    if len(pre_lines) != len(gold_lines):
+        raise ValueError("pre file and gold file have different line count.")
+    f1_sum = 0
+    for label in labels:
+        f1 = get_f1_score_for_each_label(pre_lines, gold_lines, label)
+        print('label: %s, F1: %.6f' % (label, f1))
+        f1_sum += f1
+
+    return f1_sum/len(labels)
+
+
+def get_result(labels, pre_file, gold_file):
+    """Print the average F1 score over `labels` for pre_file against gold_file."""
+    average_f1 = get_f1_score(labels, pre_file, gold_file)
+    print("avg F1: %.6f" % average_f1)
--- a/model_zoo/official/nlp/bert/src/tokenization.py
+++ b/model_zoo/official/nlp/bert/src/tokenization.py
@ -0,0 +1,329 @@
+# Copyright 2020 Huawei Technologies Co., Ltd
+#
+# Licensed under the Apache License, Version 2.0 (the "License");
+# you may not use this file except in compliance with the License.
+# You may obtain a copy of the License at
+#
+# http://www.apache.org/licenses/LICENSE-2.0
+#
+# Unless required by applicable law or agreed to in writing, software
+# distributed under the License is distributed on an "AS IS" BASIS,
+# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
+# See the License for the specific language governing permissions and
+# limitations under the License.
+# ============================================================================
+
+"""
+Tokenization.
+"""
+
+import unicodedata
+import collections
+
+def convert_to_unicode(text):
+    """
+    Return text as a str.
+    Args:
+        text: str, or bytes holding UTF-8 data.
+
+    Returns:
+        the str itself, or the bytes decoded as UTF-8 with undecodable
+        bytes dropped.
+
+    Raises:
+        ValueError: if text is neither str nor bytes.
+    """
+    if isinstance(text, str):
+        return text
+    if isinstance(text, bytes):
+        return text.decode("utf-8", "ignore")
+    raise ValueError("Unsupported string type: %s" % (type(text)))
+
+
+def vocab_to_dict_key_token(vocab_file):
+    """
+    Loads a vocab file into a dict, key is token.
+    Args:
+        vocab_file: path to a UTF-8 vocab file with one token per line.
+
+    Returns:
+        OrderedDict mapping each stripped line to its zero-based line number;
+        if a token repeats, the last line number wins.
+    """
+    vocab = collections.OrderedDict()
+    # Decode explicitly as UTF-8 rather than with the platform's locale default.
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        # Text-mode lines are already str, so no further conversion is needed.
+        for index, line in enumerate(reader):
+            vocab[line.strip()] = index
+    return vocab
+
+
+def vocab_to_dict_key_id(vocab_file):
+    """
+    Loads a vocab file into a dict, key is id.
+    Args:
+        vocab_file: path to a UTF-8 vocab file with one token per line.
+
+    Returns:
+        OrderedDict mapping each zero-based line number to its stripped line.
+    """
+    vocab = collections.OrderedDict()
+    # Decode explicitly as UTF-8 rather than with the platform's locale default.
+    with open(vocab_file, "r", encoding="utf-8") as reader:
+        # Text-mode lines are already str, so no further conversion is needed.
+        for index, line in enumerate(reader):
+            vocab[index] = line.strip()
+    return vocab
+
+
+def whitespace_tokenize(text):
+    """Split text on runs of whitespace; empty or blank text gives an empty list."""
+    stripped = text.strip()
+    return stripped.split() if stripped else []
+
+
+def convert_tokens_to_ids(vocab_file, tokens):
+    """
+    Map tokens to their vocab ids.
+    Args:
+        vocab_file: path to vocab.txt; the file is re-read on every call.
+        tokens: list of tokens.
+
+    Returns:
+        list of ids, in the same order as tokens.
+
+    Raises:
+        KeyError: if a token is not in the vocab.
+    """
+    token_to_id = vocab_to_dict_key_token(vocab_file)
+    return [token_to_id[token] for token in tokens]
+
+
+def convert_ids_to_tokens(vocab_file, ids):
+    """
+    Map vocab ids back to their tokens.
+    Args:
+        vocab_file: path to vocab.txt; the file is re-read on every call.
+        ids: list of ids.
+
+    Returns:
+        list of tokens, in the same order as ids.
+
+    Raises:
+        KeyError: if an id is not in the vocab.
+    """
+    id_to_token = vocab_to_dict_key_id(vocab_file)
+    return [id_to_token[_id] for _id in ids]
+
+
+class FullTokenizer():
+    """
+    Full tokenizer: basic tokenization followed by word-piece tokenization.
+    """
+    def __init__(self, vocab_file, do_lower_case=True):
+        self.vocab_dict = vocab_to_dict_key_token(vocab_file)
+        self.do_lower_case = do_lower_case
+        self.basic_tokenize = BasicTokenizer(do_lower_case)
+        self.wordpiece_tokenize = WordpieceTokenizer(self.vocab_dict)
+
+    def tokenize(self, text):
+        """
+        Tokenize text into word pieces.
+        Args:
+            text: str of text.
+
+        Returns:
+            flat list of the word pieces of every basic token, in order.
+        """
+        unicode_text = convert_to_unicode(text)
+        return [piece
+                for word in self.basic_tokenize.tokenize(unicode_text)
+                for piece in self.wordpiece_tokenize.tokenize(word)]
+
+
+class BasicTokenizer():
+    """
+    Basic tokenizer: cleans text, isolates CJK characters, optionally
+    lower-cases and strips accents, and splits off punctuation.
+    """
+    def __init__(self, do_lower_case=True):
+        self.do_lower_case = do_lower_case
+
+    def tokenize(self, text):
+        """
+        Do basic tokenization.
+        Args:
+            text: text in unicode.
+
+        Returns:
+            a list of tokens split from text
+        """
+        cleaned = self._tokenize_chinese_chars(self._clean_text(text))
+
+        pieces = []
+        for word in whitespace_tokenize(cleaned):
+            if self.do_lower_case:
+                word = self._run_strip_accents(word.lower())
+            pieces.extend(self._run_split_on_punc(word))
+
+        return whitespace_tokenize(" ".join(pieces))
+
+    def _run_strip_accents(self, text):
+        """Strips accents (category "Mn" marks after NFD normalization) from text."""
+        decomposed = unicodedata.normalize("NFD", text)
+        return "".join(ch for ch in decomposed if unicodedata.category(ch) != "Mn")
+
+    def _run_split_on_punc(self, text):
+        """Splits text so that every punctuation character is its own piece."""
+        groups = []
+        begin_group = True
+        for ch in text:
+            if _is_punctuation(ch):
+                groups.append([ch])
+                begin_group = True
+                continue
+            if begin_group:
+                groups.append([])
+                begin_group = False
+            groups[-1].append(ch)
+        return ["".join(group) for group in groups]
+
+    def _clean_text(self, text):
+        """Drops NUL, U+FFFD and control characters; turns whitespace into " "."""
+        kept = []
+        for ch in text:
+            if ord(ch) in (0, 0xfffd) or _is_control(ch):
+                continue
+            kept.append(" " if _is_whitespace(ch) else ch)
+        return "".join(kept)
+
+    def _tokenize_chinese_chars(self, text):
+        """Adds whitespace around any CJK character."""
+        return "".join((" " + ch + " ") if self._is_chinese_char(ord(ch)) else ch
+                       for ch in text)
+
+    def _is_chinese_char(self, cp):
+        """Checks whether CP is the codepoint of a CJK character."""
+        # This defines a "chinese character" as anything in the CJK Unicode block:
+        #   https://en.wikipedia.org/wiki/CJK_Unified_Ideographs_(Unicode_block)
+        #
+        # Note that the CJK Unicode block is NOT all Japanese and Korean characters,
+        # despite its name. The modern Korean Hangul alphabet is a different block,
+        # as is Japanese Hiragana and Katakana. Those alphabets are used to write
+        # space-separated words, so they are not treated specially and are handled
+        # like all of the other languages.
+        cjk_ranges = (
+            (0x4E00, 0x9FFF),
+            (0x3400, 0x4DBF),
+            (0x20000, 0x2A6DF),
+            (0x2A700, 0x2B73F),
+            (0x2B740, 0x2B81F),
+            (0x2B820, 0x2CEAF),
+            (0xF900, 0xFAFF),
+            (0x2F800, 0x2FA1F),
+        )
+        return any(low <= cp <= high for low, high in cjk_ranges)
+
+
+class WordpieceTokenizer():
+    """
+    Wordpiece tokenizer: greedy longest-match-first split of words into
+    pieces found in the vocab dict.
+    """
+    def __init__(self, vocab):
+        # vocab is only used for membership tests (`substr in self.vocab_dict`).
+        self.vocab_dict = vocab
+
+    def tokenize(self, tokens):
+        """
+        Do word-piece tokenization
+        Args:
+            tokens: a word; if it contains whitespace, each whitespace-separated
+                part is processed as its own word.
+
+        Returns:
+            a list of word pieces. Pieces that do not start a word carry a
+            "##" prefix. When no piece matches at some position, "[UNK]" is
+            appended and the rest of that word is dropped; pieces already
+            matched for the word are kept.
+        """
+        output_tokens = []
+        tokens = convert_to_unicode(tokens)
+        for token in whitespace_tokenize(tokens):
+            chars = list(token)
+            len_chars = len(chars)
+            # [start, end) is the candidate piece; start only moves forward on a match.
+            start = 0
+            end = len_chars
+            while start < len_chars:
+                # Try the longest candidate first and shrink it from the right.
+                while start < end:
+                    substr = "".join(token[start:end])
+                    if start != 0:
+                        substr = "##" + substr
+                    if substr in self.vocab_dict:
+                        output_tokens.append(substr)
+                        # Continue after the matched piece with the longest remainder.
+                        start = end
+                        end = len_chars
+                    else:
+                        end = end - 1
+                # The candidate shrank to nothing before the word's end: no piece matches here.
+                if start == end and start != len_chars:
+                    output_tokens.append("[UNK]")
+                    break
+        return output_tokens
+
+
+def _is_whitespace(char):
+    """Checks whether `char` is a whitespace character."""
+    # \t, \n, and \r are technically control characters but we treat them
+    # as whitespace since they are generally considered as such.
+    if char in (" ", "\t", "\n", "\r"):
+        return True
+    return unicodedata.category(char) == "Zs"
+
+
+def _is_control(char):
+    """Checks whether `char` is a control character."""
+    # \t, \n, and \r are technically control characters but we count them
+    # as whitespace characters, so they are excluded here.
+    if char in ("\t", "\n", "\r"):
+        return False
+    return unicodedata.category(char) in ("Cc", "Cf")
+
+
+def _is_punctuation(char):
+    """Checks whether `char` is a punctuation character."""
+    cp = ord(char)
+    # We treat all non-letter/number ASCII as punctuation.
+    # Characters such as "^", "$", and "`" are not in the Unicode
+    # Punctuation class but we treat them as punctuation anyways, for
+    # consistency.
+    is_ascii_symbol = (33 <= cp <= 47 or 58 <= cp <= 64 or
+                       91 <= cp <= 96 or 123 <= cp <= 126)
+    return is_ascii_symbol or unicodedata.category(char).startswith("P")
--- a/model_zoo/official/nlp/bert/src/utils.py
+++ b/model_zoo/official/nlp/bert/src/utils.py
@ -19,6 +19,7 @@ Functional Cells used in Bert finetune and evaluation.

 import os
 import math
+import collections
 import numpy as np
 import mindspore.nn as nn
 from mindspore import log as logger
@ -213,3 +214,19 @@ class BertLearningRate(LearningRateSchedule):
        else:
            lr = decay_lr
        return lr
+
+
+def convert_labels_to_index(label_list):
+    """
+    Convert label_list to indices for NER task.
+    Args:
+        label_list: list of label names.
+
+    Returns:
+        OrderedDict with "O" mapped to 0, followed by "S_", "B_", "M_" and
+        "E_" variants of every label numbered consecutively from 1.
+    """
+    prefixes = ("S_", "B_", "M_", "E_")
+    label2id = collections.OrderedDict()
+    label2id["O"] = 0
+    for label_pos, label in enumerate(label_list):
+        # Each label owns a run of len(prefixes) consecutive indices.
+        base = label_pos * len(prefixes)
+        for offset, pre in enumerate(prefixes, start=1):
+            label2id[pre + label] = base + offset
+    return label2id