From f6e03a51f0d958e772027d788badc395137f9f12 Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Fri, 29 Jan 2021 03:15:03 +0000 Subject: [PATCH 1/8] upload rare code --- configs/rec/rec_mv3_tps_bilstm_att.yml | 102 +++++++++++ configs/rec/rec_r34_vd_tps_bilstm_att.yml | 103 +++++++++++ ppocr/data/imaug/label_ops.py | 24 ++- ppocr/losses/__init__.py | 5 +- ppocr/losses/rec_att_loss.py | 39 ++++ ppocr/modeling/heads/__init__.py | 5 +- ppocr/modeling/heads/rec_att_head.py | 211 ++++++++++++++++++++++ ppocr/postprocess/__init__.py | 3 +- ppocr/postprocess/rec_postprocess.py | 46 ++++- 9 files changed, 525 insertions(+), 13 deletions(-) create mode 100644 configs/rec/rec_mv3_tps_bilstm_att.yml create mode 100644 configs/rec/rec_r34_vd_tps_bilstm_att.yml create mode 100644 ppocr/losses/rec_att_loss.py create mode 100644 ppocr/modeling/heads/rec_att_head.py diff --git a/configs/rec/rec_mv3_tps_bilstm_att.yml b/configs/rec/rec_mv3_tps_bilstm_att.yml new file mode 100644 index 00000000..c64b2ccc --- /dev/null +++ b/configs/rec/rec_mv3_tps_bilstm_att.yml @@ -0,0 +1,102 @@ +Global: + use_gpu: true + epoch_num: 72 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/rec_mv3_tps_bilstm_att/ + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0.00001 + +Architecture: + model_type: rec + algorithm: RARE + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 0.1 + model_name: small + Backbone: + name: MobileNetV3 + scale: 0.5 + model_name: large + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 96 + Head: + name: AttentionHead + hidden_size: 96 + + +Loss: + name: AttentionLoss + +PostProcess: + name: AttnLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDateSet + data_dir: ../training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDateSet + data_dir: ../validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 1 diff --git a/configs/rec/rec_r34_vd_tps_bilstm_att.yml b/configs/rec/rec_r34_vd_tps_bilstm_att.yml new file mode 100644 index 00000000..f42bfdcc --- /dev/null +++ b/configs/rec/rec_r34_vd_tps_bilstm_att.yml @@ -0,0 +1,103 @@ +Global: + use_gpu: true + epoch_num: 400 + log_smooth_window: 20 + print_batch_step: 10 + save_model_dir: ./output/rec/b3_rare_r34_none_gru/ + save_epoch_step: 3 + # evaluation is run every 5000 iterations after the 4000th iteration + eval_batch_step: [0, 2000] + # if pretrained_model is saved in static mode, load_static_weights must set to True + cal_metric_during_train: True + pretrained_model: + checkpoints: + save_inference_dir: + use_visualdl: False + infer_img: doc/imgs_words/ch/word_1.jpg + # for data or label process + character_dict_path: + character_type: en + max_text_length: 25 + infer_mode: False + use_space_char: False + + +Optimizer: + name: Adam + beta1: 0.9 + beta2: 0.999 + lr: + learning_rate: 0.0005 + regularizer: + name: 'L2' + factor: 0.00000 + +Architecture: + model_type: rec + algorithm: RARE + Transform: + name: TPS + num_fiducial: 20 + loc_lr: 0.1 + model_name: large + Backbone: + name: ResNet + layers: 34 + + Neck: + name: SequenceEncoder + encoder_type: rnn + hidden_size: 256 #96 + Head: + name: AttentionHead # AttentionHead + hidden_size: 256 # + l2_decay: 0.00001 + + +Loss: + name: AttentionLoss + +PostProcess: + name: AttnLabelDecode + +Metric: + name: RecMetric + main_indicator: acc + +Train: + dataset: + name: LMDBDateSet + data_dir: ../training/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: True + batch_size_per_card: 256 + drop_last: True + num_workers: 8 + +Eval: + dataset: + name: LMDBDateSet + data_dir: ../validation/ + transforms: + - DecodeImage: # load image + img_mode: BGR + channel_first: False + - AttnLabelEncode: # Class handling label + - RecResizeImg: + image_shape: [3, 32, 100] + - KeepKeys: + keep_keys: ['image', 'label', 'length'] # dataloader will return list in this order + loader: + shuffle: False + drop_last: False + batch_size_per_card: 256 + num_workers: 8 diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 14c1cc9c..6d9ea190 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -197,16 +197,30 @@ class AttnLabelEncode(BaseRecLabelEncode): super(AttnLabelEncode, self).__init__(max_text_length, character_dict_path, character_type, use_space_char) - self.beg_str = "sos" - self.end_str = "eos" def add_special_char(self, dict_character): - dict_character = [self.beg_str, self.end_str] + dict_character + self.beg_str = "sos" + self.end_str = "eos" + dict_character = [self.beg_str] + dict_character + [self.end_str] return dict_character - def __call__(self, text): + def __call__(self, data): + text = data['label'] text = self.encode(text) - return text + if text is None: + return None + if len(text) > self.max_text_len: + return None + data['length'] = np.array(len(text)) + text = [0] + text + [len(self.character) - 1] + [0] * (self.max_text_len + - len(text) - 1) + data['label'] = np.array(text) + return data + + def get_ignored_tokens(self): + beg_idx = self.get_beg_end_flag_idx("beg") + end_idx = self.get_beg_end_flag_idx("end") + return [beg_idx, end_idx] def get_beg_end_flag_idx(self, beg_or_end): if beg_or_end == "beg": diff --git a/ppocr/losses/__init__.py b/ppocr/losses/__init__.py index 4673d35c..94314235 100755 --- a/ppocr/losses/__init__.py +++ b/ppocr/losses/__init__.py @@ -23,11 +23,14 @@ def build_loss(config): # rec loss from .rec_ctc_loss import CTCLoss + from .rec_att_loss import AttentionLoss # cls loss from .cls_loss import ClsLoss - support_dict = ['DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss'] + support_dict = [ + 'DBLoss', 'EASTLoss', 'SASTLoss', 'CTCLoss', 'ClsLoss', 'AttentionLoss' + ] config = copy.deepcopy(config) module_name = config.pop('name') diff --git a/ppocr/losses/rec_att_loss.py b/ppocr/losses/rec_att_loss.py new file mode 100644 index 00000000..6e2f6748 --- /dev/null +++ b/ppocr/losses/rec_att_loss.py @@ -0,0 +1,39 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +from paddle import nn + + +class AttentionLoss(nn.Layer): + def __init__(self, **kwargs): + super(AttentionLoss, self).__init__() + self.loss_func = nn.CrossEntropyLoss(weight=None, reduction='none') + + def forward(self, predicts, batch): + targets = batch[1].astype("int64") + label_lengths = batch[2].astype('int64') + batch_size, num_steps, num_classes = predicts.shape[0], predicts.shape[ + 1], predicts.shape[2] + assert len(targets.shape) == len(list(predicts.shape)) - 1, \ + "The target's shape and inputs's shape is [N, d] and [N, num_steps]" + + inputs = paddle.reshape(predicts, [-1, predicts.shape[-1]]) + targets = paddle.reshape(targets, [-1]) + + return {'loss': paddle.sum(self.loss_func(inputs, targets))} diff --git a/ppocr/modeling/heads/__init__.py b/ppocr/modeling/heads/__init__.py index 78074709..29d0ba80 100755 --- a/ppocr/modeling/heads/__init__.py +++ b/ppocr/modeling/heads/__init__.py @@ -23,10 +23,13 @@ def build_head(config): # rec head from .rec_ctc_head import CTCHead + from .rec_att_head import AttentionHead # cls head from .cls_head import ClsHead - support_dict = ['DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead'] + support_dict = [ + 'DBHead', 'EASTHead', 'SASTHead', 'CTCHead', 'ClsHead', 'AttentionHead' + ] module_name = config.pop('name') assert module_name in support_dict, Exception('head only support {}'.format( diff --git a/ppocr/modeling/heads/rec_att_head.py b/ppocr/modeling/heads/rec_att_head.py new file mode 100644 index 00000000..d01f0e6c --- /dev/null +++ b/ppocr/modeling/heads/rec_att_head.py @@ -0,0 +1,211 @@ +# copyright (c) 2021 PaddlePaddle Authors. All Rights Reserve. +# +# Licensed under the Apache License, Version 2.0 (the "License"); +# you may not use this file except in compliance with the License. +# You may obtain a copy of the License at +# +# http://www.apache.org/licenses/LICENSE-2.0 +# +# Unless required by applicable law or agreed to in writing, software +# distributed under the License is distributed on an "AS IS" BASIS, +# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied. +# See the License for the specific language governing permissions and +# limitations under the License. + +from __future__ import absolute_import +from __future__ import division +from __future__ import print_function + +import paddle +import paddle.nn as nn +import paddle.nn.functional as F +import numpy as np +from paddle.jit import to_static + + +class AttentionHead(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionHead, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionGRUCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = paddle.zeros((batch_size, self.hidden_size)) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + output_hiddens.append(paddle.unsqueeze(outputs, axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + (outputs, hidden), alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(outputs) + probs = paddle.unsqueeze( + probs_step, axis=1) if probs is None else paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + next_input = probs_step.argmax(axis=1) + targets = next_input + + return probs + + +class AttentionGRUCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionGRUCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden), axis=1) + + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha + + +class AttentionLSTM(nn.Layer): + def __init__(self, in_channels, out_channels, hidden_size, **kwargs): + super(AttentionLSTM, self).__init__() + self.input_size = in_channels + self.hidden_size = hidden_size + self.num_classes = out_channels + + self.attention_cell = AttentionLSTMCell( + in_channels, hidden_size, out_channels, use_gru=False) + self.generator = nn.Linear(hidden_size, out_channels) + + def _char_to_onehot(self, input_char, onehot_dim): + input_ont_hot = F.one_hot(input_char, onehot_dim) + return input_ont_hot + + def forward(self, inputs, targets=None, batch_max_length=25): + batch_size = inputs.shape[0] + num_steps = batch_max_length + + hidden = (paddle.zeros((batch_size, self.hidden_size)), paddle.zeros( + (batch_size, self.hidden_size))) + output_hiddens = [] + + if targets is not None: + for i in range(num_steps): + # one-hot vectors for a i-th char + char_onehots = self._char_to_onehot( + targets[:, i], onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + + hidden = (hidden[1][0], hidden[1][1]) + output_hiddens.append(paddle.unsqueeze(hidden[0], axis=1)) + output = paddle.concat(output_hiddens, axis=1) + probs = self.generator(output) + + else: + targets = paddle.zeros(shape=[batch_size], dtype="int32") + probs = None + + for i in range(num_steps): + char_onehots = self._char_to_onehot( + targets, onehot_dim=self.num_classes) + hidden, alpha = self.attention_cell(hidden, inputs, + char_onehots) + probs_step = self.generator(hidden[0]) + hidden = (hidden[1][0], hidden[1][1]) + probs = paddle.unsqueeze( + probs_step, axis=1) if probs is None else paddle.concat( + [probs, paddle.unsqueeze( + probs_step, axis=1)], axis=1) + + next_input = probs_step.argmax(axis=1) + + targets = next_input + + return probs + + +class AttentionLSTMCell(nn.Layer): + def __init__(self, input_size, hidden_size, num_embeddings, use_gru=False): + super(AttentionLSTMCell, self).__init__() + self.i2h = nn.Linear(input_size, hidden_size, bias_attr=False) + self.h2h = nn.Linear(hidden_size, hidden_size) + self.score = nn.Linear(hidden_size, 1, bias_attr=False) + if not use_gru: + self.rnn = nn.LSTMCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + else: + self.rnn = nn.GRUCell( + input_size=input_size + num_embeddings, hidden_size=hidden_size) + + self.hidden_size = hidden_size + + def forward(self, prev_hidden, batch_H, char_onehots): + batch_H_proj = self.i2h(batch_H) + prev_hidden_proj = paddle.unsqueeze(self.h2h(prev_hidden[0]), axis=1) + res = paddle.add(batch_H_proj, prev_hidden_proj) + res = paddle.tanh(res) + e = self.score(res) + + alpha = F.softmax(e, axis=1) + alpha = paddle.transpose(alpha, [0, 2, 1]) + context = paddle.squeeze(paddle.mm(alpha, batch_H), axis=1) + concat_context = paddle.concat([context, char_onehots], 1) + cur_hidden = self.rnn(concat_context, prev_hidden) + + return cur_hidden, alpha + + +if __name__ == '__main__': + paddle.disable_static() + + model = Attention(100, 200, 10) + + x = np.random.uniform(-1, 1, [2, 10, 100]).astype(np.float32) + y = np.random.randint(0, 10, [2, 21]).astype(np.int32) + + xp = paddle.to_tensor(x) + yp = paddle.to_tensor(y) + + res = model(inputs=xp, targets=yp, is_train=True, batch_max_length=20) + print("res: ", res.shape) diff --git a/ppocr/postprocess/__init__.py b/ppocr/postprocess/__init__.py index c9b42e08..2b8d00a9 100644 --- a/ppocr/postprocess/__init__.py +++ b/ppocr/postprocess/__init__.py @@ -30,7 +30,8 @@ def build_post_process(config, global_config=None): from .cls_postprocess import ClsPostProcess support_dict = [ - 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', 'AttnLabelDecode', 'ClsPostProcess' + 'DBPostProcess', 'EASTPostProcess', 'SASTPostProcess', 'CTCLabelDecode', + 'AttnLabelDecode', 'ClsPostProcess', 'AttnLabelDecode' ] config = copy.deepcopy(config) diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 65ed4671..1ac35246 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -133,16 +133,52 @@ class AttnLabelDecode(BaseRecLabelDecode): **kwargs): super(AttnLabelDecode, self).__init__(character_dict_path, character_type, use_space_char) - self.beg_str = "sos" - self.end_str = "eos" def add_special_char(self, dict_character): - dict_character = [self.beg_str, self.end_str] + dict_character + self.beg_str = "sos" + self.end_str = "eos" + dict_character = dict_character + dict_character = [self.beg_str] + dict_character + [self.end_str] return dict_character - def __call__(self, text): + def __call__(self, preds, label=None, *args, **kwargs): + """ text = self.decode(text) - return text + if label is None: + return text + else: + label = self.decode(label, is_remove_duplicate=False) + return text, label + """ + if isinstance(preds, paddle.Tensor): + preds = preds.numpy() + + preds_idx = preds.argmax(axis=2) + preds_prob = preds.max(axis=2) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + if label is None: + return text + label = self.decode(label, is_remove_duplicate=True) + return text, label + + def encoder(self, labels, labels_length): + """ + used to encoder labels readed from LMDB dataset, forexample: + [35, 25, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] encode to + 'you': [0, 35,25,31, 37, 0, ...] 'sos'you'eos' + """ + if isinstance(labels, paddle.Tensor): + labels = labels.numpy() + batch_max_length = labels.shape[ + 1] + 2 # add start token 'sos' and end token 'eos' + new_labels = np.zeros( + [labels.shape[0], batch_max_length]).astype(np.int64) + for i in range(labels.shape[0]): + new_labels[i, 1:1 + labels_length[i]] = labels[i, :labels_length[ + i]] # new_labels[i, 0] = 'sos' token + new_labels[i, labels_length[i] + 1] = len( + self.character) - 1 # add end charactor 'eos' token + return new_labels def get_ignored_tokens(self): beg_idx = self.get_beg_end_flag_idx("beg") From f896032255566ffa3194e0d8f3843e7131a13ca7 Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Mon, 1 Feb 2021 03:12:38 +0000 Subject: [PATCH 2/8] pre-commit --- tools/infer/predict_system.py | 2 +- 1 file changed, 1 insertion(+), 1 deletion(-) diff --git a/tools/infer/predict_system.py b/tools/infer/predict_system.py index 8c4f9214..de7ee9d3 100755 --- a/tools/infer/predict_system.py +++ b/tools/infer/predict_system.py @@ -184,4 +184,4 @@ def main(args): if __name__ == "__main__": - main(utility.parse_args()) \ No newline at end of file + main(utility.parse_args()) From 7a054c854b8253a2a088c107e55de20a3f207a26 Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Mon, 1 Feb 2021 06:27:56 +0000 Subject: [PATCH 3/8] rare doc and opt post_process --- doc/doc_ch/algorithm_overview.md | 5 +++- doc/doc_ch/recognition.md | 2 ++ doc/doc_en/algorithm_overview_en.md | 4 ++- doc/doc_en/recognition_en.md | 3 +++ ppocr/postprocess/rec_postprocess.py | 37 +++++++++++++++++++++++++--- 5 files changed, 45 insertions(+), 6 deletions(-) diff --git a/doc/doc_ch/algorithm_overview.md b/doc/doc_ch/algorithm_overview.md index abbc5da4..4ff7482c 100755 --- a/doc/doc_ch/algorithm_overview.md +++ b/doc/doc_ch/algorithm_overview.md @@ -40,7 +40,7 @@ PaddleOCR基于动态图开源的文本识别算法列表: - [x] CRNN([paper](https://arxiv.org/abs/1507.05717))[7](ppocr推荐) - [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))[10] - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] -- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] coming soon +- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] - [x] SRN([paper](https://arxiv.org/abs/2003.12294))[5] 参考[DTRB][3](https://arxiv.org/abs/1904.01906)文字识别训练和评估流程,使用MJSynth和SynthText两个文字识别数据集训练,在IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE数据集上进行评估,算法效果如下: @@ -53,6 +53,9 @@ PaddleOCR基于动态图开源的文本识别算法列表: |CRNN|MobileNetV3|79.97%|rec_mv3_none_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)| |StarNet|Resnet34_vd|84.44%|rec_r34_vd_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)| |StarNet|MobileNetV3|81.42%|rec_mv3_tps_bilstm_ctc|[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)| +|RARE|MobileNetV3|82.5|rec_mv3_tps_bilstm_att||[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| +|RARE|Resnet34_vd|83.6|rec_r34_vd_tps_bilstm_att||[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| |SRN|Resnet50_vd_fpn| 88.52% | rec_r50fpn_vd_none_srn | [下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar) | + PaddleOCR文本识别算法的训练和使用请参考文档教程中[模型训练/评估中的文本识别部分](./recognition.md)。 diff --git a/doc/doc_ch/recognition.md b/doc/doc_ch/recognition.md index bc877ab7..f36e8019 100644 --- a/doc/doc_ch/recognition.md +++ b/doc/doc_ch/recognition.md @@ -201,6 +201,8 @@ PaddleOCR支持训练和评估交替进行, 可以在 `configs/rec/rec_icdar15_t | rec_mv3_none_none_ctc.yml | Rosetta | Mobilenet_v3 large 0.5 | None | None | ctc | | rec_r34_vd_none_bilstm_ctc.yml | CRNN | Resnet34_vd | None | BiLSTM | ctc | | rec_r34_vd_none_none_ctc.yml | Rosetta | Resnet34_vd | None | None | ctc | +| rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att | +| rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att | | rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | 训练中文数据,推荐使用[rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml),如您希望尝试其他算法在中文数据集上的效果,请参考下列说明修改配置文件: diff --git a/doc/doc_en/algorithm_overview_en.md b/doc/doc_en/algorithm_overview_en.md index 7d7896e7..423fe807 100755 --- a/doc/doc_en/algorithm_overview_en.md +++ b/doc/doc_en/algorithm_overview_en.md @@ -42,7 +42,7 @@ PaddleOCR open-source text recognition algorithms list: - [x] CRNN([paper](https://arxiv.org/abs/1507.05717))[7] - [x] Rosetta([paper](https://arxiv.org/abs/1910.05085))[10] - [x] STAR-Net([paper](http://www.bmva.org/bmvc/2016/papers/paper043/index.html))[11] -- [ ] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] coming soon +- [x] RARE([paper](https://arxiv.org/abs/1603.03915v1))[12] - [x] SRN([paper](https://arxiv.org/abs/2003.12294))[5] Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation result of these above text recognition (using MJSynth and SynthText for training, evaluate on IIIT, SVT, IC03, IC13, IC15, SVTP, CUTE) is as follow: @@ -55,6 +55,8 @@ Refer to [DTRB](https://arxiv.org/abs/1904.01906), the training and evaluation r |CRNN|MobileNetV3|79.97%|rec_mv3_none_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_none_bilstm_ctc_v2.0_train.tar)| |StarNet|Resnet34_vd|84.44%|rec_r34_vd_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_ctc_v2.0_train.tar)| |StarNet|MobileNetV3|81.42%|rec_mv3_tps_bilstm_ctc|[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_ctc_v2.0_train.tar)| +|RARE|MobileNetV3|82.5|rec_mv3_tps_bilstm_att||[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_mv3_tps_bilstm_att_v2.0_train.tar)| +|RARE|Resnet34_vd|83.6|rec_r34_vd_tps_bilstm_att||[下载链接](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r34_vd_tps_bilstm_att_v2.0_train.tar)| |SRN|Resnet50_vd_fpn| 88.52% | rec_r50fpn_vd_none_srn |[Download link](https://paddleocr.bj.bcebos.com/dygraph_v2.0/en/rec_r50_vd_srn_train.tar)| Please refer to the document for training guide and use of PaddleOCR text recognition algorithms [Text recognition model training/evaluation/prediction](./recognition_en.md) diff --git a/doc/doc_en/recognition_en.md b/doc/doc_en/recognition_en.md index f29703d1..c2ff2022 100644 --- a/doc/doc_en/recognition_en.md +++ b/doc/doc_en/recognition_en.md @@ -195,8 +195,11 @@ If the evaluation set is large, the test will be time-consuming. It is recommend | rec_mv3_none_none_ctc.yml | Rosetta | Mobilenet_v3 large 0.5 | None | None | ctc | | rec_r34_vd_none_bilstm_ctc.yml | CRNN | Resnet34_vd | None | BiLSTM | ctc | | rec_r34_vd_none_none_ctc.yml | Rosetta | Resnet34_vd | None | None | ctc | +| rec_mv3_tps_bilstm_att.yml | CRNN | Mobilenet_v3 | TPS | BiLSTM | att | +| rec_r34_vd_tps_bilstm_att.yml | CRNN | Resnet34_vd | TPS | BiLSTM | att | | rec_r50fpn_vd_none_srn.yml | SRN | Resnet50_fpn_vd | None | rnn | srn | + For training Chinese data, it is recommended to use [rec_chinese_lite_train_v2.0.yml](../../configs/rec/ch_ppocr_v2.0/rec_chinese_lite_train_v2.0.yml). If you want to try the result of other algorithms on the Chinese data set, please refer to the following instructions to modify the configuration file: co diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index 2b82750f..d7e658f3 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -143,6 +143,35 @@ class AttnLabelDecode(BaseRecLabelDecode): dict_character = [self.beg_str] + dict_character + [self.end_str] return dict_character + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + """ convert text-index into text-label. """ + result_list = [] + ignored_tokens = self.get_ignored_tokens() + [beg_idx, end_idx] = self.get_ignored_tokens() + batch_size = len(text_index) + for batch_idx in range(batch_size): + char_list = [] + conf_list = [] + for idx in range(len(text_index[batch_idx])): + if text_index[batch_idx][idx] in ignored_tokens: + continue + if int(text_index[batch_idx][idx]) == int(end_idx): + break + if is_remove_duplicate: + # only for predict + if idx > 0 and text_index[batch_idx][idx - 1] == text_index[ + batch_idx][idx]: + continue + char_list.append(self.character[int(text_index[batch_idx][ + idx])]) + if text_prob is not None: + conf_list.append(text_prob[batch_idx][idx]) + else: + conf_list.append(1) + text = ''.join(char_list) + result_list.append((text, np.mean(conf_list))) + return result_list + def __call__(self, preds, label=None, *args, **kwargs): """ text = self.decode(text) @@ -157,10 +186,10 @@ class AttnLabelDecode(BaseRecLabelDecode): preds_idx = preds.argmax(axis=2) preds_prob = preds.max(axis=2) - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) if label is None: return text - label = self.decode(label, is_remove_duplicate=True) + label = self.decode(label, is_remove_duplicate=False) return text, label def encoder(self, labels, labels_length): @@ -226,12 +255,12 @@ class SRNLabelDecode(BaseRecLabelDecode): text = self.decode(preds_idx, preds_prob) if label is None: - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) return text label = self.decode(label) return text, label - def decode(self, text_index, text_prob=None, is_remove_duplicate=False): + def decode(self, text_index, text_prob=None, is_remove_duplicate=True): """ convert text-index into text-label. """ result_list = [] ignored_tokens = self.get_ignored_tokens() From 0f4d92b63f811db56561f0f7223909dbe56d4fe3 Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Mon, 1 Feb 2021 06:32:14 +0000 Subject: [PATCH 4/8] fix conflict wiith SRN --- ppocr/postprocess/rec_postprocess.py | 4 ++-- 1 file changed, 2 insertions(+), 2 deletions(-) diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index d7e658f3..d4991222 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -255,12 +255,12 @@ class SRNLabelDecode(BaseRecLabelDecode): text = self.decode(preds_idx, preds_prob) if label is None: - text = self.decode(preds_idx, preds_prob, is_remove_duplicate=True) + text = self.decode(preds_idx, preds_prob, is_remove_duplicate=False) return text label = self.decode(label) return text, label - def decode(self, text_index, text_prob=None, is_remove_duplicate=True): + def decode(self, text_index, text_prob=None, is_remove_duplicate=False): """ convert text-index into text-label. """ result_list = [] ignored_tokens = self.get_ignored_tokens() From e7d24ac8b87a76d36c1f0e022d450db633e00017 Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Mon, 1 Feb 2021 06:41:45 +0000 Subject: [PATCH 5/8] fix comment --- configs/rec/rec_r34_vd_tps_bilstm_att.yml | 2 -- ppocr/postprocess/rec_postprocess.py | 19 ------------------- 2 files changed, 21 deletions(-) diff --git a/configs/rec/rec_r34_vd_tps_bilstm_att.yml b/configs/rec/rec_r34_vd_tps_bilstm_att.yml index f42bfdcc..7be34b9c 100644 --- a/configs/rec/rec_r34_vd_tps_bilstm_att.yml +++ b/configs/rec/rec_r34_vd_tps_bilstm_att.yml @@ -43,7 +43,6 @@ Architecture: Backbone: name: ResNet layers: 34 - Neck: name: SequenceEncoder encoder_type: rnn @@ -52,7 +51,6 @@ Architecture: name: AttentionHead # AttentionHead hidden_size: 256 # l2_decay: 0.00001 - Loss: name: AttentionLoss diff --git a/ppocr/postprocess/rec_postprocess.py b/ppocr/postprocess/rec_postprocess.py index d4991222..af243caa 100644 --- a/ppocr/postprocess/rec_postprocess.py +++ b/ppocr/postprocess/rec_postprocess.py @@ -192,25 +192,6 @@ class AttnLabelDecode(BaseRecLabelDecode): label = self.decode(label, is_remove_duplicate=False) return text, label - def encoder(self, labels, labels_length): - """ - used to encoder labels readed from LMDB dataset, forexample: - [35, 25, 31, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0]] encode to - 'you': [0, 35,25,31, 37, 0, ...] 'sos'you'eos' - """ - if isinstance(labels, paddle.Tensor): - labels = labels.numpy() - batch_max_length = labels.shape[ - 1] + 2 # add start token 'sos' and end token 'eos' - new_labels = np.zeros( - [labels.shape[0], batch_max_length]).astype(np.int64) - for i in range(labels.shape[0]): - new_labels[i, 1:1 + labels_length[i]] = labels[i, :labels_length[ - i]] # new_labels[i, 0] = 'sos' token - new_labels[i, labels_length[i] + 1] = len( - self.character) - 1 # add end charactor 'eos' token - return new_labels - def get_ignored_tokens(self): beg_idx = self.get_beg_end_flag_idx("beg") end_idx = self.get_beg_end_flag_idx("end") From 550022ea663df53f62d199954c328493043ec1e0 Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Mon, 1 Feb 2021 06:44:04 +0000 Subject: [PATCH 6/8] fix comment --- ppocr/data/imaug/label_ops.py | 2 +- ppocr/modeling/heads/rec_att_head.py | 15 --------------- 2 files changed, 1 insertion(+), 16 deletions(-) diff --git a/ppocr/data/imaug/label_ops.py b/ppocr/data/imaug/label_ops.py index 191bda92..26ac4d81 100644 --- a/ppocr/data/imaug/label_ops.py +++ b/ppocr/data/imaug/label_ops.py @@ -211,7 +211,7 @@ class AttnLabelEncode(BaseRecLabelEncode): text = self.encode(text) if text is None: return None - if len(text) > self.max_text_len: + if len(text) >= self.max_text_len: return None data['length'] = np.array(len(text)) text = [0] + text + [len(self.character) - 1] + [0] * (self.max_text_len diff --git a/ppocr/modeling/heads/rec_att_head.py b/ppocr/modeling/heads/rec_att_head.py index d01f0e6c..9f065d61 100644 --- a/ppocr/modeling/heads/rec_att_head.py +++ b/ppocr/modeling/heads/rec_att_head.py @@ -194,18 +194,3 @@ class AttentionLSTMCell(nn.Layer): cur_hidden = self.rnn(concat_context, prev_hidden) return cur_hidden, alpha - - -if __name__ == '__main__': - paddle.disable_static() - - model = Attention(100, 200, 10) - - x = np.random.uniform(-1, 1, [2, 10, 100]).astype(np.float32) - y = np.random.randint(0, 10, [2, 21]).astype(np.int32) - - xp = paddle.to_tensor(x) - yp = paddle.to_tensor(y) - - res = model(inputs=xp, targets=yp, is_train=True, batch_max_length=20) - print("res: ", res.shape) From 0d89f3f91321811a77186f9a67ba10bb542a2efa Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Mon, 1 Feb 2021 06:54:56 +0000 Subject: [PATCH 7/8] fix comment --- ppocr/modeling/heads/rec_att_head.py | 1 - 1 file changed, 1 deletion(-) diff --git a/ppocr/modeling/heads/rec_att_head.py b/ppocr/modeling/heads/rec_att_head.py index 9f065d61..bfe37e7a 100644 --- a/ppocr/modeling/heads/rec_att_head.py +++ b/ppocr/modeling/heads/rec_att_head.py @@ -20,7 +20,6 @@ import paddle import paddle.nn as nn import paddle.nn.functional as F import numpy as np -from paddle.jit import to_static class AttentionHead(nn.Layer): From a094d2775560a6dbb6e18cd761b99edd238956c2 Mon Sep 17 00:00:00 2001 From: LDOUBLEV Date: Mon, 1 Feb 2021 08:08:18 +0000 Subject: [PATCH 8/8] opt rec_att_head --- ppocr/modeling/heads/rec_att_head.py | 12 ++++++++---- 1 file changed, 8 insertions(+), 4 deletions(-) diff --git a/ppocr/modeling/heads/rec_att_head.py b/ppocr/modeling/heads/rec_att_head.py index bfe37e7a..a7cfe128 100644 --- a/ppocr/modeling/heads/rec_att_head.py +++ b/ppocr/modeling/heads/rec_att_head.py @@ -64,8 +64,10 @@ class AttentionHead(nn.Layer): (outputs, hidden), alpha = self.attention_cell(hidden, inputs, char_onehots) probs_step = self.generator(outputs) - probs = paddle.unsqueeze( - probs_step, axis=1) if probs is None else paddle.concat( + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( [probs, paddle.unsqueeze( probs_step, axis=1)], axis=1) next_input = probs_step.argmax(axis=1) @@ -152,8 +154,10 @@ class AttentionLSTM(nn.Layer): char_onehots) probs_step = self.generator(hidden[0]) hidden = (hidden[1][0], hidden[1][1]) - probs = paddle.unsqueeze( - probs_step, axis=1) if probs is None else paddle.concat( + if probs is None: + probs = paddle.unsqueeze(probs_step, axis=1) + else: + probs = paddle.concat( [probs, paddle.unsqueeze( probs_step, axis=1)], axis=1)