Training an NER model with spaCy
This commit is contained in:
commit 444178debe
@@ -0,0 +1,149 @@

[paths]
train = null
dev = null
vectors = null
init_tok2vec = null

[system]
gpu_allocator = "pytorch"
seed = 0

[nlp]
lang = "en"
pipeline = ["transformer","ner"]
batch_size = 128
disabled = []
before_creation = null
after_creation = null
after_pipeline_creation = null
tokenizer = {"@tokenizers":"spacy.Tokenizer.v1"}
vectors = {"@vectors":"spacy.Vectors.v1"}

[components]

[components.ner]
factory = "ner"
incorrect_spans_key = null
moves = null
scorer = {"@scorers":"spacy.ner_scorer.v1"}
update_with_oracle_cut_size = 100

[components.ner.model]
@architectures = "spacy.TransitionBasedParser.v2"
state_type = "ner"
extra_state_tokens = false
hidden_width = 64
maxout_pieces = 2
use_upper = false
nO = null

[components.ner.model.tok2vec]
@architectures = "spacy-transformers.TransformerListener.v1"
grad_factor = 1.0
pooling = {"@layers":"reduce_mean.v1"}
upstream = "*"

[components.transformer]
factory = "transformer"
max_batch_items = 4096
set_extra_annotations = {"@annotation_setters":"spacy-transformers.null_annotation_setter.v1"}

[components.transformer.model]
@architectures = "spacy-transformers.TransformerModel.v3"
# name = "roberta-base"
# If the connection to HuggingFace fails, download the model files to a local directory and update `name`
name = "../cache/roberta_base"
mixed_precision = false

[components.transformer.model.get_spans]
@span_getters = "spacy-transformers.strided_spans.v1"
window = 128
stride = 96

[components.transformer.model.grad_scaler_config]

[components.transformer.model.tokenizer_config]
use_fast = true

[components.transformer.model.transformer_config]

[corpora]

[corpora.dev]
@readers = "spacy.Corpus.v1"
path = ${paths.dev}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[corpora.train]
@readers = "spacy.Corpus.v1"
path = ${paths.train}
max_length = 0
gold_preproc = false
limit = 0
augmenter = null

[training]
accumulate_gradient = 3
dev_corpus = "corpora.dev"
train_corpus = "corpora.train"
seed = ${system.seed}
gpu_allocator = ${system.gpu_allocator}
dropout = 0.1
patience = 1600
max_epochs = 0
max_steps = 20000
eval_frequency = 200
frozen_components = []
annotating_components = []
before_to_disk = null
before_update = null

[training.batcher]
@batchers = "spacy.batch_by_padded.v1"
discard_oversize = true
size = 2000
buffer = 256
get_length = null

[training.logger]
@loggers = "spacy.ConsoleLogger.v1"
progress_bar = false

[training.optimizer]
@optimizers = "Adam.v1"
beta1 = 0.9
beta2 = 0.999
L2_is_weight_decay = true
L2 = 0.01
grad_clip = 1.0
use_averages = false
eps = 0.00000001

[training.optimizer.learn_rate]
@schedules = "warmup_linear.v1"
warmup_steps = 250
total_steps = 20000
initial_rate = 0.00005

[training.score_weights]
ents_f = 1.0
ents_p = 0.0
ents_r = 0.0
ents_per_type = null

[pretraining]

[initialize]
vectors = ${paths.vectors}
init_tok2vec = ${paths.init_tok2vec}
vocab_data = null
lookups = null
before_init = null
after_init = null

[initialize.components]

[initialize.tokenizer]
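As the comment in `[components.transformer.model]` notes, `name` can point at a local directory when HuggingFace is unreachable. A minimal sketch of preparing that directory (assuming the `transformers` package is installed; the `../cache/roberta_base` path is the one the config expects):

    # Download roberta-base once and save it where config.cfg points
    from transformers import AutoModel, AutoTokenizer

    tokenizer = AutoTokenizer.from_pretrained("roberta-base")
    model = AutoModel.from_pretrained("roberta-base")
    tokenizer.save_pretrained("../cache/roberta_base")
    model.save_pretrained("../cache/roberta_base")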
@@ -0,0 +1,6 @@

python -m spacy init config ./config.cfg \
    --lang en \
    --pipeline ner \
    --optimize accuracy \
    --gpu \
    --force
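Before training, the generated file can be inspected from Python; a small sanity-check sketch using spaCy's own config loader:

    from spacy.util import load_config

    config = load_config("./config.cfg")
    print(config["nlp"]["pipeline"])  # expected: ['transformer', 'ner']
    print(config["components"]["transformer"]["model"]["name"])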
@@ -0,0 +1,382 @@

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/huaian/mambaforge/envs/mytrans/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import json\n",
    "import random\n",
    "import spacy\n",
    "from tqdm import tqdm\n",
    "from pathlib import Path\n",
    "from spacy.tokens import DocBin\n",
    "\n",
    "random.seed(42)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "def generate_ner_ds(file_path: Path, save_dir: Path):\n",
    "    \"\"\"Build an NER dataset in the format required by spaCy 2.x\"\"\"\n",
    "    with open(file_path, \"r\") as reader:\n",
    "        all_lines = reader.readlines()\n",
    "\n",
    "    dataset = []\n",
    "    token_ls, anno_ls = [], []\n",
    "    for line in tqdm(all_lines):\n",
    "        # Collect the tokens and annotations of the current sentence\n",
    "        line = line.strip()\n",
    "        if line != \"\":\n",
    "            token, annotation = line.split(\"\\t\")[0:2]\n",
    "            token_ls.append(token)\n",
    "            anno_ls.append(annotation)\n",
    "\n",
    "        # Join the tokens into a sentence and record each annotation's span\n",
    "        elif len(token_ls) != 0:\n",
    "            # Drop sentences that start with \"CODE_BLOCK\"\n",
    "            if token_ls[0] != \"CODE_BLOCK\":\n",
    "                sentence = \"\"\n",
    "                anno_span_ls = []  # spans of all annotations in this sentence\n",
    "                for tok, anno in zip(token_ls, anno_ls):\n",
    "                    sentence += tok\n",
    "                    if anno != \"O\":\n",
    "                        anno_span = (\n",
    "                            len(sentence) - len(tok),\n",
    "                            len(sentence),\n",
    "                            \"CODE_ENT\",\n",
    "                        )\n",
    "                        anno_span_ls.append(anno_span)\n",
    "                    sentence += \" \"\n",
    "                if len(anno_span_ls) != 0:\n",
    "                    dataset.append((sentence.strip(), {\"entities\": anno_span_ls}))\n",
    "            token_ls, anno_ls = [], []\n",
    "\n",
    "    print(\"NER dataset[0:5]\")\n",
    "    for item in dataset[0:5]:\n",
    "        print(f\"\\t{item}\")\n",
    "\n",
    "    save_dir.mkdir(parents=True, exist_ok=True)\n",
    "    save_path = save_dir / (file_path.stem + \".json\")\n",
    "    with open(save_path, \"w\") as f:\n",
    "        json.dump(dataset, f, ensure_ascii=False, indent=2)\n",
    "    print(f\"File {save_path.name} saved!\")\n",
    "    print(\"=\" * 20)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 86911/86911 [00:00<00:00, 1733163.86it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NER dataset[0:5]\n",
      "\t('@petergoldstein Thanks for submitting this PR !', {'entities': [(0, 15, 'CODE_ENT')]})\n",
      "\t(\"I 'm closing in favor of #13 and other changes already in master that support ActiveRecord 4+ .\", {'entities': [(78, 90, 'CODE_ENT'), (91, 93, 'CODE_ENT')]})\n",
      "\t('Currently everything works OK if only one scope is present , however the setup() method has no way of discriminating devices by serial number , and we automatically select the first scope we find .', {'entities': [(73, 80, 'CODE_ENT')]})\n",
      "\t('R.I.Pineear has a nice blog post ( partially ) about this .', {'entities': [(0, 11, 'CODE_ENT')]})\n",
      "\t('I like the idea of repository and build metadata embedded in the image .', {'entities': [(65, 70, 'CODE_ENT')]})\n",
      "File GH_test_set.json saved!\n",
      "====================\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 180996/180996 [00:00<00:00, 1418604.94it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NER dataset[0:5]\n",
      "\t('If I would have 2 tables', {'entities': [(18, 24, 'CODE_ENT')]})\n",
      "\t('SQLFIDDLE : http://sqlfiddle.com/#!9/11093', {'entities': [(0, 9, 'CODE_ENT')]})\n",
      "\t('Just add a where clause :', {'entities': [(11, 16, 'CODE_ENT')]})\n",
      "\t('A more traditional approach uses NOT EXISTS :', {'entities': [(33, 36, 'CODE_ENT'), (37, 43, 'CODE_ENT')]})\n",
      "\t('Here is a SQL Fiddle illustrating that the first works .', {'entities': [(10, 13, 'CODE_ENT'), (14, 20, 'CODE_ENT')]})\n",
      "File train.json saved!\n",
      "====================\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 57023/57023 [00:00<00:00, 1870912.15it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NER dataset[0:5]\n",
      "\t('In Java + = operator has an implicit cast to the left hand type .', {'entities': [(3, 7, 'CODE_ENT'), (8, 9, 'CODE_ENT'), (10, 11, 'CODE_ENT')]})\n",
      "\t('As everyone already stated , the + = has an implicit cast .', {'entities': [(33, 34, 'CODE_ENT'), (35, 36, 'CODE_ENT')]})\n",
      "\t('And a table of their meanings :', {'entities': [(6, 11, 'CODE_ENT')]})\n",
      "\t(\"So let 's take a look at the bytecode from some simple Java code :\", {'entities': [(55, 59, 'CODE_ENT')]})\n",
      "\t('My comments will have a // in front .', {'entities': [(24, 26, 'CODE_ENT')]})\n",
      "File dev.json saved!\n",
      "====================\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 60745/60745 [00:00<00:00, 1902288.40it/s]\n"
     ]
    },
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "NER dataset[0:5]\n",
      "\t('I am using custom adapter which I use for my ListView .', {'entities': [(18, 25, 'CODE_ENT'), (45, 53, 'CODE_ENT')]})\n",
      "\t('After creating ArrayList', {'entities': [(15, 24, 'CODE_ENT')]})\n",
      "\t('However , when I try to click on the checkbox , nothing happens .', {'entities': [(37, 45, 'CODE_ENT')]})\n",
      "\t('So I have to manage toggling checkbox state manually .', {'entities': [(29, 37, 'CODE_ENT')]})\n",
      "\t('( before that I have to remove setChoiceMode method call )', {'entities': [(31, 44, 'CODE_ENT')]})\n",
      "File test.json saved!\n",
      "====================\n"
     ]
    }
   ],
   "source": [
    "data_dir = Path(\"../../data/annotated_ner_data\")\n",
    "dataset_dir = Path(\"../../data/ner_dataset\")\n",
    "file_names = [\n",
    "    \"GitHub/GH_test_set.txt\",\n",
    "    \"StackOverflow/train.txt\",\n",
    "    \"StackOverflow/dev.txt\",\n",
    "    \"StackOverflow/test.txt\",\n",
    "]\n",
    "\n",
    "for file_name in file_names:\n",
    "    file_path = data_dir / file_name\n",
    "    generate_ner_ds(file_path, dataset_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "def split_train_test_ds(dataset_dir: Path, split_rate=0.9):\n",
    "    \"\"\"Split the data into train and test sets\"\"\"\n",
    "    ner_ds = []  # all NER samples\n",
    "\n",
    "    # Note: assumes dataset_dir holds only the per-split JSON files at this point\n",
    "    for file_path in dataset_dir.iterdir():\n",
    "        with open(file_path, \"r\") as f:\n",
    "            lines = json.load(f)\n",
    "            print(f\"File {file_path.name} #samples: {len(lines)}\")\n",
    "            ner_ds.extend(lines)\n",
    "\n",
    "    print(f\"NER dataset #sample: {len(ner_ds)}\")\n",
    "    with open(dataset_dir / \"ner_ds.json\", \"w\") as f:\n",
    "        json.dump(ner_ds, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "    # Generate shuffled indices\n",
    "    idx = list(range(len(ner_ds)))\n",
    "    random.shuffle(idx)\n",
    "\n",
    "    # Split into train/test sets and save them\n",
    "    split_idx = int(split_rate * len(ner_ds))\n",
    "    ner_train_ds = [ner_ds[i] for i in idx[:split_idx]]\n",
    "    print(f\"NER train dataset #sample: {len(ner_train_ds)}\")\n",
    "    with open(dataset_dir / \"ner_train_ds.json\", \"w\") as f:\n",
    "        json.dump(ner_train_ds, f, ensure_ascii=False, indent=2)\n",
    "\n",
    "    ner_test_ds = [ner_ds[i] for i in idx[split_idx:]]\n",
    "    print(f\"NER test dataset #sample: {len(ner_test_ds)}\")\n",
    "    with open(dataset_dir / \"ner_test_ds.json\", \"w\") as f:\n",
    "        json.dump(ner_test_ds, f, ensure_ascii=False, indent=2)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "File test.json #samples: 1982\n",
      "File train.json #samples: 5868\n",
      "File dev.json #samples: 1857\n",
      "File GH_test_set.json #samples: 3219\n",
      "NER dataset #sample: 12926\n",
      "NER train dataset #sample: 11633\n",
      "NER test dataset #sample: 1293\n"
     ]
    }
   ],
   "source": [
    "split_train_test_ds(dataset_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "def convert_ner_ds_format(file_path: Path, save_dir: Path):\n",
    "    \"\"\"Convert the NER dataset from the spaCy 2.x format to the 3.x format\"\"\"\n",
    "    with open(file_path, \"r\") as f:\n",
    "        dataset = json.load(f)\n",
    "\n",
    "    nlp = spacy.blank(\"en\")  # load a new spacy model\n",
    "    db = DocBin()  # create a DocBin object\n",
    "\n",
    "    for text, anno in tqdm(dataset):  # data in previous format\n",
    "        doc = nlp.make_doc(text)  # create doc object from text\n",
    "        ents = []\n",
    "        for start, end, label in anno[\"entities\"]:\n",
    "            span = doc.char_span(start, end, label=label, alignment_mode=\"contract\")\n",
    "            if span is None:\n",
    "                print(\"Skipping entity\")\n",
    "            else:\n",
    "                ents.append(span)\n",
    "        doc.ents = ents\n",
    "        db.add(doc)\n",
    "    save_path = save_dir / (file_path.stem + \".spacy\")\n",
    "    db.to_disk(save_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "100%|██████████| 11633/11633 [00:01<00:00, 7864.58it/s]\n",
      "100%|██████████| 1293/1293 [00:00<00:00, 6090.11it/s]\n"
     ]
    }
   ],
   "source": [
    "convert_ner_ds_format(dataset_dir / \"ner_train_ds.json\", dataset_dir)\n",
    "convert_ner_ds_format(dataset_dir / \"ner_test_ds.json\", dataset_dir)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "foo_idx = list(range(10))\n",
    "print(foo_idx)\n",
    "random.shuffle(foo_idx)\n",
    "print(foo_idx)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "# A plain list cannot be indexed with a tuple, so select the elements one by one\n",
    "print([foo_idx[i] for i in (1, 3, 5)])"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "foo_path = Path(\"../../data/ner_dataset\")\n",
    "for item in foo_path.iterdir():\n",
    "    print(item)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(foo_path)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": [
    "print(foo_path.is_dir())"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "tld",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
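The notebook above serializes the converted datasets with DocBin. A quick round-trip sketch (paths taken from the notebook) confirms the annotations load back as expected:

    import spacy
    from spacy.tokens import DocBin

    nlp = spacy.blank("en")
    db = DocBin().from_disk("../../data/ner_dataset/ner_train_ds.spacy")
    docs = list(db.get_docs(nlp.vocab))
    print(f"#docs: {len(docs)}")
    print([(ent.text, ent.label_) for ent in docs[0].ents])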
@@ -0,0 +1,84 @@

{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "/home/huaian/mambaforge/envs/mytrans/lib/python3.9/site-packages/tqdm/auto.py:21: TqdmWarning: IProgress not found. Please update jupyter and ipywidgets. See https://ipywidgets.readthedocs.io/en/stable/user_install.html\n",
      "  from .autonotebook import tqdm as notebook_tqdm\n"
     ]
    }
   ],
   "source": [
    "import spacy"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/html": [
       "<span class=\"tex2jax_ignore\"><div class=\"entities\" style=\"line-height: 2.5; direction: ltr\">Then use \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    AJAX\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">CODE_ENT</span>\n",
       "</mark>\n",
       " to submit the \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    form\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">CODE_ENT</span>\n",
       "</mark>\n",
       " and show results in the \n",
       "<mark class=\"entity\" style=\"background: #ddd; padding: 0.45em 0.6em; margin: 0 0.25em; line-height: 1; border-radius: 0.35em;\">\n",
       "    #results\n",
       "    <span style=\"font-size: 0.8em; font-weight: bold; line-height: 1; border-radius: 0.35em; vertical-align: middle; margin-left: 0.5rem\">CODE_ENT</span>\n",
       "</mark>\n",
       " -container</div></span>"
      ],
      "text/plain": [
       "<IPython.core.display.HTML object>"
      ]
     },
     "metadata": {},
     "output_type": "display_data"
    }
   ],
   "source": [
    "nlp = spacy.load(\"../model/model-best/\")\n",
    "text = \"Then use AJAX to submit the form and show results in the #results -container\"\n",
    "doc = nlp(text)\n",
    "\n",
    "spacy.displacy.render(doc, style=\"ent\", jupyter=True)"
   ]
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "mytrans",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.9.18"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 2
}
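Outside Jupyter, the same model's predictions can be read off programmatically instead of rendered as HTML; a minimal sketch reusing the sentence from the notebook above:

    import spacy

    nlp = spacy.load("../model/model-best/")
    doc = nlp("Then use AJAX to submit the form and show results in the #results -container")
    for ent in doc.ents:
        print(ent.text, ent.start_char, ent.end_char, ent.label_)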
@@ -0,0 +1,6 @@

python -m spacy train ./config.cfg \
    --output ../model \
    --gpu-id 0 \
    --paths.train ../data/ner_dataset/ner_train_ds.spacy \
    --paths.dev ../data/ner_dataset/ner_test_ds.spacy \
    --system.seed 42
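After training, the best checkpoint in `../model/model-best` can also be scored on the held-out set from Python; a sketch following spaCy's documented evaluation recipe (paths assumed from the command above):

    import spacy
    from spacy.tokens import DocBin
    from spacy.training import Example

    nlp = spacy.load("../model/model-best/")
    db = DocBin().from_disk("../data/ner_dataset/ner_test_ds.spacy")
    # Pair an unannotated copy of each doc (predicted) with the gold doc (reference)
    examples = [Example(nlp.make_doc(gold.text), gold) for gold in db.get_docs(nlp.vocab)]
    scores = nlp.evaluate(examples)
    print(scores["ents_p"], scores["ents_r"], scores["ents_f"])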
@@ -0,0 +1,11 @@

# Data format

The datasets are represented in the CoNLL format. Each line of a file consists of four tab-separated fields:

<word> "\t" <NE> "\t" <word> "\t" <markdown>

The end of a sentence is marked with an empty line.

In each line, `<NE>` is the human-annotated named entity tag, and `<markdown>` is the code tag provided by the user who wrote the post.
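The data-prep notebook's `generate_ner_ds` only consumes the first two fields of each line. A tiny parsing sketch (the field values shown here are hypothetical, for illustration only):

    # hypothetical example line from a dataset file
    line = "ListView\tB-Code\tListView\tcode"
    word, ne = line.split("\t")[0:2]
    print(word, ne)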