Train an NER model with the spaCy library

huaian_zhou 2024-01-12 14:13:51 +08:00
commit 444178debe
20 changed files with 1351319 additions and 0 deletions

149
ner/code/config.cfg Normal file

@@ -0,0 +1,149 @@
[paths]
train = null
dev = null
vectors = null
init_tok2vec = null
[system]
gpu_allocator = "pytorch"
seed = 0
[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}
[components]
[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100
[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null
[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"
[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}
[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
# name = "roberta-base"
# If the connection to HuggingFace fails, download the model files locally and point name at them
name = "../cache/roberta_base"
mixed_precision = false
[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96
[components.transformer.model.grad_scaler_config]
[components.transformer.model.tokenizer_config]
use_fast = true
[components.transformer.model.transformer_config]
[corpora]
[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null
[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null
[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null
[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false
[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001
[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005
[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null
[pretraining]
[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null
[initialize.components]
[initialize.tokenizer]
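
Note that the [paths] values are deliberately null: they are supplied at training time via the --paths.* flags (see gene_train.sh below). As a rough sketch, the same overrides can also be applied programmatically with spacy.util.load_config; the file names assumed here are the ones produced by gene_ner_ds.ipynb.

# Rough sketch: load the config and fill in the paths that the CLI would pass.
# The override keys mirror the --paths.* flags in gene_train.sh.
from spacy import util

config = util.load_config(
    "./config.cfg",
    overrides={
        "paths.train": "../data/ner_dataset/ner_train_ds.spacy",
        "paths.dev": "../data/ner_dataset/ner_test_ds.spacy",
    },
)
print(config["paths"])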

6
ner/code/gene_config.sh Normal file

@@ -0,0 +1,6 @@
python -m spacy init config ./config.cfg \
--lang en \
--pipeline ner \
--optimize accuracy \
--gpu \
--force

382
ner/code/gene_ner_ds.ipynb Normal file

@@ -0,0 +1,382 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/huaian/mambaforge/envs/mytrans/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import json\n",
"import random\n",
"import spacy\n",
"from tqdm import tqdm\n",
"from pathlib import Path\n",
"from spacy.tokens import DocBin\n",
"\n",
"random.seed(42)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [],
"source": [
"def generate_ner_ds(file_path: Path, save_dir: Path):\n",
" \"\"\"创建符合spacy2.x库格式要求的NER数据集\"\"\"\n",
" with open(file_path, \"r\") as reader:\n",
" all_lines = reader.readlines()\n",
"\n",
" dataset = []\n",
" token_ls, anno_ls = [], []\n",
" for line in tqdm(all_lines):\n",
" # 保存同一个句子的token和annotation到列表中\n",
" line = line.strip()\n",
" if line != \"\":\n",
" token, annotation = line.split(\"\\t\")[0:2]\n",
" token_ls.append(token)\n",
" anno_ls.append(annotation)\n",
"\n",
" # 拼接成一个句子并记录annotation的位置\n",
" elif len(token_ls) != 0:\n",
" # 舍弃\"CODE_BLOCK\"开头的句子\n",
" if token_ls[0] != \"CODE_BLOCK\":\n",
" sentence = \"\"\n",
" anno_span_ls = [] # 记录所有annotation的位置\n",
" for tok, anno in zip(token_ls, anno_ls):\n",
" sentence += tok\n",
" if anno != \"O\":\n",
" anno_span = (\n",
" len(sentence) - len(tok),\n",
" len(sentence),\n",
" \"CODE_ENT\",\n",
" )\n",
" anno_span_ls.append(anno_span)\n",
" sentence += \" \"\n",
" if len(anno_span_ls) != 0:\n",
" dataset.append((sentence.strip(), {\"entities\": anno_span_ls}))\n",
" token_ls, anno_ls = [], []\n",
"\n",
" print(\"NER dataset[0:5]\")\n",
" for item in dataset[0:5]:\n",
" print(f\"\\t{item}\")\n",
"\n",
" # # save_dir = Path(\"../../data/ner_dataset\")\n",
"\n",
" save_dir.mkdir(parents=True, exist_ok=True)\n",
" save_path = save_dir / (file_path.stem + \".json\")\n",
" with open(save_path, \"w\") as f:\n",
" json.dump(dataset, f, ensure_ascii=False, indent=2)\n",
" print(f\"File {save_path.name} saved!\")\n",
" print(\"=\" * 20)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 86911/86911 [00:00<00:00, 1733163.86it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"NER dataset[0:5]\n",
"\t('@petergoldstein Thanks for submitting this PR !', {'entities': [(0, 15, 'CODE_ENT')]})\n",
"\t(\"I 'm closing in favor of #13 and other changes already in master that support ActiveRecord 4+ .\", {'entities': [(78, 90, 'CODE_ENT'), (91, 93, 'CODE_ENT')]})\n",
"\t('Currently everything works OK if only one scope is present , however the setup() method has no way of discriminating devices by serial number , and we automatically select the first scope we find .', {'entities': [(73, 80, 'CODE_ENT')]})\n",
"\t('R.I.Pineear has a nice blog post ( partially ) about this .', {'entities': [(0, 11, 'CODE_ENT')]})\n",
"\t('I like the idea of repository and build metadata embedded in the image .', {'entities': [(65, 70, 'CODE_ENT')]})\n",
"File GH_test_set.json saved!\n",
"====================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 180996/180996 [00:00<00:00, 1418604.94it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"NER dataset[0:5]\n",
"\t('If I would have 2 tables', {'entities': [(18, 24, 'CODE_ENT')]})\n",
"\t('SQLFIDDLE : http://sqlfiddle.com/#!9/11093', {'entities': [(0, 9, 'CODE_ENT')]})\n",
"\t('Just add a where clause :', {'entities': [(11, 16, 'CODE_ENT')]})\n",
"\t('A more traditional approach uses NOT EXISTS :', {'entities': [(33, 36, 'CODE_ENT'), (37, 43, 'CODE_ENT')]})\n",
"\t('Here is a SQL Fiddle illustrating that the first works .', {'entities': [(10, 13, 'CODE_ENT'), (14, 20, 'CODE_ENT')]})\n",
"File train.json saved!\n",
"====================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 57023/57023 [00:00<00:00, 1870912.15it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"NER dataset[0:5]\n",
"\t('In Java + = operator has an implicit cast to the left hand type .', {'entities': [(3, 7, 'CODE_ENT'), (8, 9, 'CODE_ENT'), (10, 11, 'CODE_ENT')]})\n",
"\t('As everyone already stated , the + = has an implicit cast .', {'entities': [(33, 34, 'CODE_ENT'), (35, 36, 'CODE_ENT')]})\n",
"\t('And a table of their meanings :', {'entities': [(6, 11, 'CODE_ENT')]})\n",
"\t(\"So let 's take a look at the bytecode from some simple Java code :\", {'entities': [(55, 59, 'CODE_ENT')]})\n",
"\t('My comments will have a // in front .', {'entities': [(24, 26, 'CODE_ENT')]})\n",
"File dev.json saved!\n",
"====================\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 60745/60745 [00:00<00:00, 1902288.40it/s]\n"
]
},
{
"name": "stdout",
"output_type": "stream",
"text": [
"NER dataset[0:5]\n",
"\t('I am using custom adapter which I use for my ListView .', {'entities': [(18, 25, 'CODE_ENT'), (45, 53, 'CODE_ENT')]})\n",
"\t('After creating ArrayList', {'entities': [(15, 24, 'CODE_ENT')]})\n",
"\t('However , when I try to click on the checkbox , nothing happens .', {'entities': [(37, 45, 'CODE_ENT')]})\n",
"\t('So I have to manage toggling checkbox state manually .', {'entities': [(29, 37, 'CODE_ENT')]})\n",
"\t('( before that I have to remove setChoiceMode method call )', {'entities': [(31, 44, 'CODE_ENT')]})\n",
"File test.json saved!\n",
"====================\n"
]
}
],
"source": [
"data_dir = Path(\"../../data/annotated_ner_data\")\n",
"dataset_dir = Path(\"../../data/ner_dataset\")\n",
"file_names = [\n",
" \"GitHub/GH_test_set.txt\",\n",
" \"StackOverflow/train.txt\",\n",
" \"StackOverflow/dev.txt\",\n",
" \"StackOverflow/test.txt\",\n",
"]\n",
"\n",
"for file_name in file_names:\n",
" file_path = data_dir / file_name\n",
" generate_ner_ds(file_path, dataset_dir)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [],
"source": [
"def split_train_test_ds(dataset_dir: Path, split_rate=0.9):\n",
" \"\"\"分割训练集和测试集\"\"\"\n",
" ner_ds = [] # 保存所有NER样本的数据集\n",
"\n",
" for file_path in dataset_dir.iterdir():\n",
" with open(file_path, \"r\") as f:\n",
" lines = json.load(f)\n",
" print(f\"File {file_path.name} #samples: {len(lines)}\")\n",
" ner_ds.extend(lines)\n",
"\n",
" print(f\"NER dataset #sample: {len(ner_ds)}\")\n",
" with open(dataset_dir / \"ner_ds.json\", \"w\") as f:\n",
" json.dump(ner_ds, f, ensure_ascii=False, indent=2)\n",
"\n",
" # 生成随机索引\n",
" idx = list(range(len(ner_ds)))\n",
" random.shuffle(idx)\n",
"\n",
" # 划分训练集、测试集并保存\n",
" split_idx = int(split_rate * len(ner_ds))\n",
" ner_train_ds = [ner_ds[i] for i in idx[:split_idx]]\n",
" print(f\"NER train dataset #sample: {len(ner_train_ds)}\")\n",
" with open(dataset_dir / \"ner_train_ds.json\", \"w\") as f:\n",
" json.dump(ner_train_ds, f, ensure_ascii=False, indent=2)\n",
"\n",
" ner_test_ds = [ner_ds[i] for i in idx[split_idx:]]\n",
" print(f\"NER test dataset #sample: {len(ner_test_ds)}\")\n",
" with open(dataset_dir / \"ner_test_ds.json\", \"w\") as f:\n",
" json.dump(ner_test_ds, f, ensure_ascii=False, indent=2)"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"File test.json #samples: 1982\n",
"File train.json #samples: 5868\n",
"File dev.json #samples: 1857\n",
"File GH_test_set.json #samples: 3219\n",
"NER dataset #sample: 12926\n",
"NER train dataset #sample: 11633\n",
"NER test dataset #sample: 1293\n"
]
}
],
"source": [
"split_train_test_ds(dataset_dir)"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [],
"source": [
"def convert_ner_ds_format(file_path: Path, save_dir: Path):\n",
" \"\"\"将spacy2.x库所需格式的NER数据集转换为3.x库所需的格式\"\"\"\n",
" with open(file_path, \"r\") as f:\n",
" dataset = json.load(f)\n",
"\n",
" nlp = spacy.blank(\"en\") # load a new spacy model\n",
" db = DocBin() # create a DocBin object\n",
"\n",
" for text, anno in tqdm(dataset): # data in previous format\n",
" doc = nlp.make_doc(text) # create doc object from text\n",
" ents = []\n",
" for start, end, label in anno[\"entities\"]:\n",
" span = doc.char_span(start, end, label=label, alignment_mode=\"contract\")\n",
" if span is None:\n",
" print(\"Skipping entity\")\n",
" else:\n",
" ents.append(span)\n",
" doc.ents = ents\n",
" db.add(doc)\n",
" save_path = save_dir / (file_path.stem + \".spacy\")\n",
" db.to_disk(save_path)"
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"100%|██████████| 11633/11633 [00:01<00:00, 7864.58it/s]\n",
"100%|██████████| 1293/1293 [00:00<00:00, 6090.11it/s]\n"
]
}
],
"source": [
"convert_ner_ds_format(dataset_dir / \"ner_train_ds.json\", dataset_dir)\n",
"convert_ner_ds_format(dataset_dir / \"ner_test_ds.json\", dataset_dir)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"foo_idx = list(range(10))\n",
"print(foo_idx)\n",
"random.shuffle(foo_idx)\n",
"print(foo_idx)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(foo_idx[(1, 3, 5)])"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"foo_path = Path(\"../../data/ner_dataset\")\n",
"for item in foo_path.iterdir():\n",
" print(item)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(foo_path)"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": [
"print(foo_path.is_dir())"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "tld",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
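
A quick sanity check after the notebook above is to read the serialized DocBin files back and count documents and entity spans. A minimal sketch, assuming the files ended up under ner/data/ner_dataset/ (adjust the relative path if the notebook wrote them elsewhere):

# Minimal check (assumed paths, relative to ner/code/): load the serialized DocBins
# and report how many docs and entity spans survived the conversion.
import spacy
from spacy.tokens import DocBin

nlp = spacy.blank("en")
for name in ("ner_train_ds.spacy", "ner_test_ds.spacy"):
    docs = list(DocBin().from_disk(f"../data/ner_dataset/{name}").get_docs(nlp.vocab))
    print(name, len(docs), "docs,", sum(len(d.ents) for d in docs), "entities")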

84
ner/code/ner_test.ipynb Normal file

@@ -0,0 +1,84 @@
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/home/huaian/mambaforge/envs/mytrans/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
" from .autonotebook import tqdm as notebook_tqdm\n"
]
}
],
"source": [
"import spacy"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">Then use \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
" AJAX\n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">CODE_ENT</span>\n",
"</mark>\n",
" to submit the \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
" form\n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">CODE_ENT</span>\n",
"</mark>\n",
" and show results in the \n",
"<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
" #results\n",
" <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">CODE_ENT</span>\n",
"</mark>\n",
" -container</div></span>"
],
"text/plain": [
"<IPython.core.display.HTML object>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"nlp = spacy.load(\"../model/model-best/\")\n",
"text = \"Then use AJAX to submit the form and show results in the #results -container\"\n",
"doc = nlp(text)\n",
"\n",
"spacy.displacy.render(doc, style=\"ent\", jupyter=True)"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "mytrans",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.18"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
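
For use outside a notebook, the same smoke test works without displacy; a small sketch, assuming the model path used above:

# Sketch: print the recognized spans as plain (text, label) tuples instead of HTML.
import spacy

nlp = spacy.load("../model/model-best/")
doc = nlp("Then use AJAX to submit the form and show results in the #results -container")
print([(ent.text, ent.label_) for ent in doc.ents])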


@@ -0,0 +1,6 @@
python -m spacy train ./config.cfg \
--output ../model \
--gpu-id 0 \
--paths.train ../data/ner_dataset/ner_train_ds.spacy \
--paths.dev ../data/ner_dataset/ner_test_ds.spacy \
--system.seed 42
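
Once training finishes, the best checkpoint is written to ../model/model-best. A hedged sketch of scoring it against the held-out DocBin from Python (paths assumed to match the command above):

# Sketch: evaluate the trained pipeline on the held-out set and print the NER scores.
import spacy
from spacy.tokens import DocBin
from spacy.training import Example

nlp = spacy.load("../model/model-best")
doc_bin = DocBin().from_disk("../data/ner_dataset/ner_test_ds.spacy")
examples = [Example(nlp.make_doc(gold.text), gold) for gold in doc_bin.get_docs(nlp.vocab)]
scores = nlp.evaluate(examples)
print({k: scores[k] for k in ("ents_p", "ents_r", "ents_f")})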

File diff suppressed because it is too large


@@ -0,0 +1,11 @@
# Data format:
The datasets are represented in the CoNLL format. In this format, each line has the following layout:
<word>+"\t"+<NE>+"\t"+<word>+"\t"+<markdown>
The end of a sentence is marked with an empty line.
In each line, `<NE>` is the human-annotated named-entity tag and `<markdown>` is the code tag provided by the users who wrote the posts.
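
To make the layout concrete, a minimal sketch of reading one sentence from a file in this format (the file name is only a placeholder; tag values other than O are dataset-specific):

# Sketch: read tab-separated lines until the blank line that ends a sentence.
# Only the first two columns (word and NE tag) are used, as in gene_ner_ds.ipynb.
tokens, tags = [], []
with open("train.txt") as reader:
    for line in reader:
        line = line.strip()
        if line == "":  # an empty line marks the end of a sentence
            break
        word, ne = line.split("\t")[0:2]
        tokens.append(word)
        tags.append(ne)
print(list(zip(tokens, tags)))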

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

37811
ner/data/ner_dataset/dev.json Normal file

File diff suppressed because it is too large

File diff suppressed because it is too large

File diff suppressed because it is too large

Binary file not shown.

File diff suppressed because it is too large

Binary file not shown.

40896
ner/data/ner_dataset/test.json Normal file

File diff suppressed because it is too large

120643
ner/data/ner_dataset/train.json Normal file

File diff suppressed because it is too large