upload PaddleOCR code

This commit is contained in:
LDOUBLEV 2020-05-10 16:26:57 +08:00
parent bc93c549fe
commit 338ba3ee4a
84 changed files with 15543 additions and 0 deletions

35
.pre-commit-config.yaml Normal file
View File

@ -0,0 +1,35 @@
- repo: https://github.com/PaddlePaddle/mirrors-yapf.git
sha: 0d79c0c469bab64f7229c9aca2b1186ef47f0e37
hooks:
- id: yapf
files: \.py$
- repo: https://github.com/pre-commit/pre-commit-hooks
sha: a11d9314b22d8f8c7556443875b731ef05965464
hooks:
- id: check-merge-conflict
- id: check-symlinks
- id: detect-private-key
files: (?!.*paddle)^.*$
- id: end-of-file-fixer
files: \.md$
- id: trailing-whitespace
files: \.md$
- repo: https://github.com/Lucas-C/pre-commit-hooks
sha: v1.0.1
hooks:
- id: forbid-crlf
files: \.md$
- id: remove-crlf
files: \.md$
- id: forbid-tabs
files: \.md$
- id: remove-tabs
files: \.md$
- repo: local
hooks:
- id: clang-format
name: clang-format
description: Format files with ClangFormat
entry: bash .clang_format.hook -i
language: system
files: \.(c|cc|cxx|cpp|cu|h|hpp|hxx|cuh|proto)$

3
.style.yapf Normal file
View File

@ -0,0 +1,3 @@
[style]
based_on_style = pep8
column_limit = 80

View File

@ -0,0 +1,22 @@
TrainReader:
reader_function: ppocr.data.det.dataset_traversal,TrainReader
process_function: ppocr.data.det.db_process,DBProcessTrain
num_workers: 8
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
EvalReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.db_process,DBProcessTest
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
test_image_shape: [736, 1280]
TestReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.db_process,DBProcessTest
single_img_path:
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
test_image_shape: [736, 1280]
do_eval: True

51
configs/det/det_db_mv3.yml Executable file
View File

@ -0,0 +1,51 @@
Global:
algorithm: DB
use_gpu: true
epoch_num: 1200
log_smooth_window: 20
print_batch_step: 2
save_model_dir: output
save_epoch_step: 200
eval_batch_step: 5000
train_batch_size_per_card: 16
test_batch_size_per_card: 16
image_shape: [3, 640, 640]
reader_yml: ./configs/det/det_db_icdar15_reader.yml
pretrain_weights: ./pretrain_models/MobileNetV3_pretrained/MobileNetV3_large_x0_5_pretrained/
save_res_path: ./output/predicts_db.txt
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.det_db_head,DBHead
model_name: large
k: 50
inner_channels: 96
out_channels: 2
Loss:
function: ppocr.modeling.losses.det_db_loss,DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.db_postprocess,DBPostProcess
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5

51
configs/det/det_db_r50_vd.yml Executable file
View File

@ -0,0 +1,51 @@
Global:
algorithm: DB
use_gpu: true
epoch_num: 1200
log_smooth_window: 20
print_batch_step: 2
save_model_dir: output
save_epoch_step: 200
eval_batch_step: 5000
train_batch_size_per_card: 8
test_batch_size_per_card: 16
image_shape: [3, 640, 640]
reader_yml: ./configs/det/det_db_icdar15_reader.yml
pretrain_weights: ./pretrain_models/ResNet50_vd_pretrained/
save_res_path: ./output/predicts_db.txt
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_resnet_vd,ResNet
layers: 50
Head:
function: ppocr.modeling.heads.det_db_head,DBHead
model_name: large
k: 50
inner_channels: 256
out_channels: 2
Loss:
function: ppocr.modeling.losses.det_db_loss,DBLoss
balance_loss: true
main_loss_type: DiceLoss
alpha: 5
beta: 10
ohem_ratio: 3
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.db_postprocess,DBPostProcess
thresh: 0.3
box_thresh: 0.7
max_candidates: 1000
unclip_ratio: 1.5

View File

@ -0,0 +1,23 @@
TrainReader:
reader_function: ppocr.data.det.dataset_traversal,TrainReader
process_function: ppocr.data.det.east_process,EASTProcessTrain
num_workers: 8
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/train_icdar2015_label.txt
background_ratio: 0.125
min_crop_side_ratio: 0.1
min_text_size: 10
EvalReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.east_process,EASTProcessTest
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
TestReader:
reader_function: ppocr.data.det.dataset_traversal,EvalTestReader
process_function: ppocr.data.det.east_process,EASTProcessTest
single_img_path:
img_set_dir: ./train_data/icdar2015/text_localization/
label_file_path: ./train_data/icdar2015/text_localization/test_icdar2015_label.txt
do_eval: True

43
configs/det/det_east_mv3.yml Executable file
View File

@ -0,0 +1,43 @@
Global:
algorithm: EAST
use_gpu: true
epoch_num: 100000
log_smooth_window: 20
print_batch_step: 5
save_model_dir: output
save_epoch_step: 200
eval_batch_step: 5000
train_batch_size_per_card: 16
test_batch_size_per_card: 16
image_shape: [3, 512, 512]
reader_yml: ./configs/det/det_east_icdar15_reader.yml
pretrain_weights: ./pretrain_models/MobileNetV3_pretrained/MobileNetV3_large_x0_5_pretrained/
save_res_path: ./output/predicts_east.txt
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.det_east_head,EASTHead
model_name: small
Loss:
function: ppocr.modeling.losses.det_east_loss,EASTLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.east_postprocess,EASTPostPocess
score_thresh: 0.8
cover_thresh: 0.1
nms_thresh: 0.2

42
configs/det/det_east_r50_vd.yml Executable file
View File

@ -0,0 +1,42 @@
Global:
algorithm: EAST
use_gpu: true
epoch_num: 100000
log_smooth_window: 20
print_batch_step: 5
save_model_dir: output
save_epoch_step: 200
eval_batch_step: 5000
train_batch_size_per_card: 8
test_batch_size_per_card: 16
image_shape: [3, 512, 512]
reader_yml: ./configs/det/det_east_icdar15_reader.yml
pretrain_weights: ./pretrain_models/ResNet50_vd_pretrained/
save_res_path: ./output/predicts_east.txt
Architecture:
function: ppocr.modeling.architectures.det_model,DetModel
Backbone:
function: ppocr.modeling.backbones.det_resnet_vd,ResNet
layers: 50
Head:
function: ppocr.modeling.heads.det_east_head,EASTHead
model_name: large
Loss:
function: ppocr.modeling.losses.det_east_loss,EASTLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999
PostProcess:
function: ppocr.postprocess.east_postprocess,EASTPostPocess
score_thresh: 0.8
cover_thresh: 0.1
nms_thresh: 0.2

View File

@ -0,0 +1,12 @@
TrainReader:
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
num_workers: 8
lmdb_sets_dir: ./train_data/data_lmdb_release/training/
EvalReader:
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
lmdb_sets_dir: ./train_data/data_lmdb_release/validation/
TestReader:
reader_function: ppocr.data.rec.dataset_traversal,LMDBReader
lmdb_sets_dir: ./train_data/data_lmdb_release/evaluation/

View File

@ -0,0 +1,42 @@
Global:
algorithm: CRNN
dataset: common
use_gpu: true
epoch_num: 300
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: ch
character_dict_path: ./ppocr/utils/ppocr_keys_v1.txt
loss_type: ctc
reader_yml: ./configs/rec/rec_chinese_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: small
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 48
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,14 @@
TrainReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
num_workers: 8
img_set_dir: .
label_file_path: ./train_data/hard_label.txt
EvalReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
img_set_dir: .
label_file_path: ./train_data/label_val_all.txt
TestReader:
reader_function: ppocr.data.rec.dataset_traversal,SimpleReader
infer_img: ./infer_img

View File

@ -0,0 +1,40 @@
Global:
algorithm: CRNN
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 96
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,38 @@
Global:
algorithm: Rosetta
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: reshape
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,49 @@
Global:
algorithm: RARE
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: attention
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: small
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_attention_head,AttentionPredict
encoder_type: rnn
SeqRNN:
hidden_size: 96
Attention:
decoder_size: 96
word_vector_dim: 96
Loss:
function: ppocr.modeling.losses.rec_attention_loss,AttentionLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,46 @@
Global:
algorithm: STARNet
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: small
Backbone:
function: ppocr.modeling.backbones.rec_mobilenet_v3,MobileNetV3
scale: 0.5
model_name: large
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 96
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,39 @@
Global:
algorithm: CRNN
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 256
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,37 @@
Global:
algorithm: Rosetta
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: reshape
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,48 @@
Global:
algorithm: RARE
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: attention
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: large
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_attention_head,AttentionPredict
encoder_type: rnn
SeqRNN:
hidden_size: 256
Attention:
decoder_size: 128
word_vector_dim: 128
Loss:
function: ppocr.modeling.losses.rec_attention_loss,AttentionLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

View File

@ -0,0 +1,45 @@
Global:
algorithm: STARNet
use_gpu: true
epoch_num: 72
log_smooth_window: 20
print_batch_step: 10
save_model_dir: output
save_epoch_step: 3
eval_batch_step: 2000
train_batch_size_per_card: 256
test_batch_size_per_card: 256
image_shape: [3, 32, 100]
max_text_length: 25
character_type: en
loss_type: ctc
reader_yml: ./configs/rec/rec_benchmark_reader.yml
pretrain_weights:
Architecture:
function: ppocr.modeling.architectures.rec_model,RecModel
TPS:
function: ppocr.modeling.stns.tps,TPS
num_fiducial: 20
loc_lr: 0.1
model_name: large
Backbone:
function: ppocr.modeling.backbones.rec_resnet_vd,ResNet
layers: 34
Head:
function: ppocr.modeling.heads.rec_ctc_head,CTCPredict
encoder_type: rnn
SeqRNN:
hidden_size: 256
Loss:
function: ppocr.modeling.losses.rec_ctc_loss,CTCLoss
Optimizer:
function: ppocr.optimizer,AdamDecay
base_lr: 0.001
beta1: 0.9
beta2: 0.999

13
ppocr/__init__.py Executable file
View File

@ -0,0 +1,13 @@
# Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

13
ppocr/data/__init__.py Executable file
View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,47 @@
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import random
import cv2
import math
import imgaug
import imgaug.augmenters as iaa
def AugmentData(data):
    """Apply one random geometric augmentation to an image and its polygons.

    The pipeline is built from ``iaa.Fliplr(0.5)``,
    ``iaa.Affine(rotate=(-10, 10))`` and ``iaa.Resize((0.5, 3))`` and is
    frozen with ``to_deterministic()`` before use, so the image call and the
    keypoint calls below go through the same frozen augmenter.

    Args:
        data: dict with at least ``'image'`` (an array exposing ``.shape``)
            and ``'polys'`` (an iterable of polygons, each an iterable of
            points indexable as ``p[0]``, ``p[1]``).

    Returns:
        The same dict, with ``'image'`` replaced by the augmented image and
        ``'polys'`` replaced by an ``np.array`` of the transformed polygons.
    """
    source_image = data['image']
    # Keypoints are declared against the shape of the image *before*
    # augmentation.
    source_shape = source_image.shape

    pipeline = iaa.Sequential([
        iaa.Fliplr(0.5),
        iaa.Affine(rotate=(-10, 10)),
        iaa.Resize((0.5, 3)),
    ]).to_deterministic()

    def _transform_polygon(points):
        # Wrap the polygon vertices as imgaug keypoints, push them through
        # the frozen pipeline and unwrap them back into (x, y) tuples.
        on_image = imgaug.KeypointsOnImage(
            [imgaug.Keypoint(pt[0], pt[1]) for pt in points],
            shape=source_shape)
        moved = pipeline.augment_keypoints([on_image])[0].keypoints
        return [(kp.x, kp.y) for kp in moved]

    data['image'] = pipeline.augment_image(source_image)
    data['polys'] = np.array(
        [_transform_polygon(poly) for poly in data['polys']])
    return data

View File

@ -0,0 +1,110 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import math
import random
import functools
import numpy as np
import cv2
import string
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.utility import create_module
import time
class TrainReader(object):
    """Per-worker batch reader over a detection label file.

    ``params`` must provide ``num_workers``, ``label_file_path``,
    ``train_batch_size_per_card`` and ``process_function``; the latter is
    resolved through ``create_module`` and instantiated with ``params``.
    """

    def __init__(self, params):
        self.num_workers = params['num_workers']
        self.label_file_path = params['label_file_path']
        self.batch_size = params['train_batch_size_per_card']
        assert 'process_function' in params,\
            "absence process_function in Reader"
        self.process = create_module(params['process_function'])(params)

    def __call__(self, process_id):
        """Build the batch generator function for worker ``process_id``.

        Returns:
            A zero-argument generator function that yields lists of processed
            samples, each of length ``batch_size`` except possibly the last.
        """

        def _iter_samples():
            # Read every label record up front, then walk a shuffled index
            # list with stride ``num_workers`` starting at ``process_id``.
            with open(self.label_file_path, "rb") as label_file:
                records = label_file.readlines()
            total = len(records)
            order = list(range(total))
            random.shuffle(order)
            for slot in range(process_id, total, self.num_workers):
                sample = self.process(records[order[slot]])
                # The process function signals an unusable record with None.
                if sample is not None:
                    yield sample

        def batch_iter_reader():
            pending = []
            for sample in _iter_samples():
                pending.append(sample)
                if len(pending) == self.batch_size:
                    yield pending
                    pending = []
            # Flush the trailing, possibly short, batch.
            if pending:
                yield pending

        return batch_iter_reader
class EvalTestReader(object):
    """Batch reader used for evaluation and test of detection models.

    ``params`` is kept as-is; it must contain ``process_function`` and, when
    the reader is called, ``test_batch_size_per_card`` plus either
    ``single_img_path`` (test mode) or ``img_set_dir``/``label_file_path``.
    """

    def __init__(self, params):
        self.params = params
        assert 'process_function' in params,\
            "absence process_function in EvalTestReader"

    def __call__(self, mode):
        """Build the batch generator function for ``mode``.

        In ``"test"`` mode with a non-None ``single_img_path`` only that image
        is read; otherwise image names are taken from the first tab-separated
        field of every line of ``label_file_path``.

        Returns:
            A zero-argument generator function yielding lists of processed
            samples; each sample is the process function's output list with
            the image name appended.
        """
        cfg = self.params
        transform = create_module(cfg['process_function'])(cfg)
        batch_size = cfg['test_batch_size_per_card']

        single_img_path = cfg['single_img_path'] if mode == "test" else None
        if single_img_path is not None:
            img_list = [[single_img_path, single_img_path]]
        else:
            root = cfg['img_set_dir']
            with open(cfg['label_file_path'], "rb") as listing:
                names = [
                    raw.decode().strip("\n").split("\t")[0]
                    for raw in listing.readlines()
                ]
            img_list = [[root + "/" + name, name] for name in names]

        def batch_iter_reader():
            pending = []
            for img_path, img_name in img_list:
                image = cv2.imread(img_path)
                if image is None:
                    # Unreadable images are logged and skipped.
                    logger.info("load image error:" + img_path)
                    continue
                result = transform(image)
                result.append(img_name)
                pending.append(result)
                if len(pending) == batch_size:
                    yield pending
                    pending = []
            if pending:
                yield pending

        return batch_iter_reader

View File

@ -0,0 +1,192 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import math
import cv2
import numpy as np
import json
import sys
from .data_augment import AugmentData
from .random_crop_data import RandomCropData
from .make_shrink_map import MakeShrinkMap
from .make_border_map import MakeBorderMap
class DBProcessTrain(object):
    """Turn one label-file record into the training tensors for DB.

    ``params`` must provide ``img_set_dir`` (prefix that is string-concatenated
    with the image name from the label record) and ``image_shape`` (its
    elements from index 1 on are passed to ``RandomCropData``).
    """

    def __init__(self, params):
        self.img_set_dir = params['img_set_dir']
        self.image_shape = params['image_shape']

    def order_points_clockwise(self, pts):
        """Reorder four 2-D points into a fixed corner order.

        Slot 0 gets the point with the smallest coordinate sum, slot 2 the
        largest sum, slot 1 the smallest (second column - first column) and
        slot 3 the largest such difference.  Presumably columns are (x, y),
        which makes the order top-left, top-right, bottom-right, bottom-left.

        Returns:
            A ``(4, 2)`` float32 array.
        """
        rect = np.zeros((4, 2), dtype="float32")
        s = pts.sum(axis=1)
        rect[0] = pts[np.argmin(s)]
        rect[2] = pts[np.argmax(s)]
        diff = np.diff(pts, axis=1)
        rect[1] = pts[np.argmin(diff)]
        rect[3] = pts[np.argmax(diff)]
        return rect

    def make_data_dict(self, imgvalue, entry):
        """Collect the valid boxes of one image into the pipeline's data dict.

        Args:
            imgvalue: image array (its first two shape entries are recorded).
            entry: iterable of dicts with ``'points'`` and ``'transcription'``.

        Returns:
            dict with keys ``image``, ``shape``, ``polys``, ``texts`` and
            ``ignore_tags``.  Boxes whose contour area is not positive are
            dropped; transcriptions ``'*'`` and ``'###'`` are flagged ignored.
        """
        boxes = []
        texts = []
        ignores = []
        for rect in entry:
            points = rect['points']
            transcription = rect['transcription']
            try:
                box = self.order_points_clockwise(
                    np.array(points).reshape(-1, 2))
                if cv2.contourArea(box) > 0:
                    boxes.append(box)
                    texts.append(transcription)
                    ignores.append(transcription in ['*', '###'])
            except Exception:
                # Best effort: a malformed box is reported and skipped.
                # Only ordinary errors are swallowed, so Ctrl-C and
                # SystemExit still propagate.
                print('load label failed!')
        data = {
            'image': imgvalue,
            'shape': [imgvalue.shape[0], imgvalue.shape[1]],
            'polys': np.array(boxes),
            'texts': texts,
            'ignore_tags': ignores,
        }
        return data

    def NormalizeImage(self, data):
        """Scale ``data['image']`` to [0, 1], standardise it and move the last
        axis to the front (``transpose((2, 0, 1))``)."""
        im = data['image']
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        im = im.astype(np.float32, copy=False)
        im = im / 255
        im -= img_mean
        im /= img_std
        channel_swap = (2, 0, 1)
        im = im.transpose(channel_swap)
        data['image'] = im
        return data

    def FilterKeys(self, data):
        """Remove the annotation-only keys that are not returned by
        ``__call__``."""
        filter_keys = ['polys', 'texts', 'ignore_tags', 'shape']
        for key in filter_keys:
            if key in data:
                del data[key]
        return data

    def convert_label_infor(self, label_infor):
        """Parse one raw label line (bytes) into ``(img_path, label)``.

        The line is ``<image name>\\t<JSON>``; a UTF-8 BOM, if present, is
        stripped by the ``utf-8-sig`` round trip.
        """
        label_infor = label_infor.decode()
        label_infor = label_infor.encode('utf-8').decode('utf-8-sig')
        substr = label_infor.strip("\n").split("\t")
        img_path = self.img_set_dir + substr[0]
        label = json.loads(substr[1])
        return img_path, label

    def __call__(self, label_infor):
        """Load, augment and encode one sample.

        Returns:
            ``None`` when the image cannot be read, otherwise the tuple
            ``(image, shrink_map, shrink_mask, threshold_map, threshold_mask)``.
        """
        img_path, gt_label = self.convert_label_infor(label_infor)
        imgvalue = cv2.imread(img_path)
        if imgvalue is None:
            return None
        data = self.make_data_dict(imgvalue, gt_label)
        data = AugmentData(data)
        data = RandomCropData(data, self.image_shape[1:])
        data = MakeShrinkMap(data)
        data = MakeBorderMap(data)
        data = self.NormalizeImage(data)
        data = self.FilterKeys(data)
        return data['image'], data['shrink_map'], data['shrink_mask'], data[
            'threshold_map'], data['threshold_mask']
class DBProcessTest(object):
    """Resize and normalise a single image for DB inference/evaluation.

    Two resize strategies exist:

    * type 0 (default): keep the aspect ratio, cap the longer side at
      ``max_side_len`` and round both sides up to a multiple of 32;
    * type 1: resize to a fixed ``(height, width)`` taken from ``params``.

    Recognised ``params`` keys:
        det_image_shape / test_image_shape: fixed ``(h, w)``; selects type 1.
            ``test_image_shape`` is the key used by the reader configs, while
            ``det_image_shape`` is the legacy key and wins when both are set.
        max_side_len: cap for type 0 (default 2400).
    """

    def __init__(self, params):
        super(DBProcessTest, self).__init__()
        self.resize_type = 0
        # Accept both spellings so that the reader yml's `test_image_shape`
        # actually enables the fixed-size path.
        for shape_key in ('det_image_shape', 'test_image_shape'):
            if shape_key in params:
                self.image_shape = params[shape_key]
                self.resize_type = 1
                break
        if 'max_side_len' in params:
            self.max_side_len = params['max_side_len']
        else:
            self.max_side_len = 2400

    def resize_image_type0(self, im):
        """
        Resize the image so that both sides are multiples of 32, as required
        by the network, after capping the longer side at ``self.max_side_len``
        (to avoid running out of GPU memory).

        :param im: the image to resize, shaped (h, w, c)
        :return: the resized image and the ``(ratio_h, ratio_w)`` resize
            ratios, or ``None, (None, None)`` when a target side is <= 0
        """
        max_side_len = self.max_side_len
        h, w, _ = im.shape

        resize_w = w
        resize_h = h

        # limit the max side
        if max(resize_h, resize_w) > max_side_len:
            if resize_h > resize_w:
                ratio = float(max_side_len) / resize_h
            else:
                ratio = float(max_side_len) / resize_w
        else:
            ratio = 1.
        resize_h = int(resize_h * ratio)
        resize_w = int(resize_w * ratio)
        # Round each side up to the next multiple of 32.
        if resize_h % 32 != 0:
            resize_h = (resize_h // 32 + 1) * 32
        if resize_w % 32 != 0:
            resize_w = (resize_w // 32 + 1) * 32
        try:
            if int(resize_w) <= 0 or int(resize_h) <= 0:
                return None, (None, None)
            im = cv2.resize(im, (int(resize_w), int(resize_h)))
        except Exception:
            # NOTE(review): the process exits with status 0 even though this
            # is a failure path; kept for compatibility with existing callers.
            print(im.shape, resize_w, resize_h)
            sys.exit(0)
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return im, (ratio_h, ratio_w)

    def resize_image_type1(self, im):
        """Resize to the fixed ``self.image_shape`` = (h, w) and return the
        image with its ``(ratio_h, ratio_w)`` resize ratios."""
        resize_h, resize_w = self.image_shape
        ori_h, ori_w = im.shape[:2]  # (h, w, c)
        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = float(resize_h) / ori_h
        ratio_w = float(resize_w) / ori_w
        return im, (ratio_h, ratio_w)

    def normalize(self, im):
        """Scale to [0, 1], standardise per channel and move the last axis to
        the front (``transpose((2, 0, 1))``)."""
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        im = im.astype(np.float32, copy=False)
        im = im / 255
        im -= img_mean
        im /= img_std
        channel_swap = (2, 0, 1)
        im = im.transpose(channel_swap)
        return im

    def __call__(self, im):
        """Return ``[batched_image, (ratio_h, ratio_w)]`` for one image; the
        image gets a leading axis of length 1."""
        if self.resize_type == 0:
            im, (ratio_h, ratio_w) = self.resize_image_type0(im)
        else:
            im, (ratio_h, ratio_w) = self.resize_image_type1(im)
        im = self.normalize(im)
        im = im[np.newaxis, :]
        return [im, (ratio_h, ratio_w)]

509
ppocr/data/det/east_process.py Executable file
View File

@ -0,0 +1,509 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import math
import cv2
import numpy as np
import json
class EASTProcessTrain(object):
def __init__(self, params):
self.img_set_dir = params['img_set_dir']
self.random_scale = np.array([0.5, 1, 2.0, 3.0])
self.background_ratio = params['background_ratio']
self.min_crop_side_ratio = params['min_crop_side_ratio']
image_shape = params['image_shape']
self.input_size = image_shape[1]
self.min_text_size = params['min_text_size']
def preprocess(self, im):
    """Resize, normalise and zero-pad an image to the square network input.

    The image is scaled so that its longer side equals ``self.input_size``,
    its last axis is reversed, values are scaled to [0, 1] and standardised,
    and the result is placed in the top-left corner of a zero canvas of
    ``input_size x input_size``.

    Args:
        im: image array shaped (h, w, 3).

    Returns:
        ``(im_padded, im_scale)`` where ``im_padded`` has shape
        ``(1, 3, input_size, input_size)`` and ``im_scale`` is the resize
        factor that was applied.
    """
    input_size = self.input_size
    im_size_max = np.max(im.shape[0:2])
    im_scale = float(input_size) / float(im_size_max)
    im = cv2.resize(im, None, None, fx=im_scale, fy=im_scale)
    img_mean = [0.485, 0.456, 0.406]
    img_std = [0.229, 0.224, 0.225]
    # Reverse the channel axis (presumably BGR -> RGB for cv2-loaded images).
    im = im[:, :, ::-1].astype(np.float32)
    im = im / 255
    im -= img_mean
    im /= img_std
    new_h, new_w, _ = im.shape
    im_padded = np.zeros((input_size, input_size, 3), dtype=np.float32)
    im_padded[:new_h, :new_w, :] = im
    im_padded = im_padded.transpose((2, 0, 1))
    im_padded = im_padded[np.newaxis, :]
    return im_padded, im_scale
def convert_label_infor(self, label_infor):
label_infor = label_infor.decode()
label_infor = label_infor.encode('utf-8').decode('utf-8-sig')
substr = label_infor.strip("\n").split("\t")
img_path = self.img_set_dir + substr[0]
label = json.loads(substr[1])
nBox = len(label)
wordBBs, txts, txt_tags = [], [], []
for bno in range(0, nBox):
wordBB = label[bno]['points']
txt = label[bno]['transcription']
wordBBs.append(wordBB)
txts.append(txt)
if txt == '###':
txt_tags.append(True)
else:
txt_tags.append(False)
wordBBs = np.array(wordBBs, dtype=np.float32)
txt_tags = np.array(txt_tags, dtype=np.bool)
return img_path, wordBBs, txt_tags, txts
def rotate_im_poly(self, im, text_polys):
    """
    Rotate the image by a random multiple of 90 degrees (1, 2 or 3 quarter
    turns via ``np.rot90``) and map the quadrilaterals accordingly.

    :param im: image array shaped (h, w, ...)
    :param text_polys: array shaped (n, 4, 2) of quadrilateral points
    :return: the rotated image and a float32 array of the rotated polygons
    """
    src_w, src_h = im.shape[1], im.shape[0]

    # One uniform draw selects the number of quarter turns.
    draw = np.random.rand()
    if 0.333 < draw < 0.666:
        quarter_turns = 2
    elif draw > 0.666:
        quarter_turns = 3
    else:
        quarter_turns = 1

    dst_im = np.rot90(im.copy(), quarter_turns)

    angle = (-90 * quarter_turns) * math.pi / 180.0
    cos_a = math.cos(angle)
    sin_a = math.sin(angle)
    # Points are rotated about the source centre and re-anchored on the
    # centre of the rotated image.
    cx, cy = 0.5 * src_w, 0.5 * src_h
    ncx, ncy = 0.5 * dst_im.shape[1], 0.5 * dst_im.shape[0]

    dst_polys = []
    for quad in text_polys:
        rotated = []
        for corner in range(4):
            sx, sy = quad[corner][0], quad[corner][1]
            dx = cos_a * (sx - cx) - sin_a * (sy - cy) + ncx
            dy = sin_a * (sx - cx) + cos_a * (sy - cy) + ncy
            rotated.append([dx, dy])
        dst_polys.append(rotated)
    dst_polys = np.array(dst_polys, dtype=np.float32)
    return dst_im, dst_polys
def polygon_area(self, poly):
    """
    Signed area of a quadrilateral, computed as half the sum over its four
    edges of (x_next - x_cur) * (y_next + y_cur).

    :param poly: four points, each indexable as [x, y]
    :return: the signed area; the sign depends on the vertex order
    """
    edge = [
        (poly[nxt][0] - poly[cur][0]) * (poly[nxt][1] + poly[cur][1])
        for cur, nxt in ((0, 1), (1, 2), (2, 3), (3, 0))
    ]
    return np.sum(edge) / 2.
def check_and_validate_polys(self, polys, tags, img_height, img_width):
"""
check so that the text poly is in the same direction,
and also filter some invalid polygons
:param polys:
:param tags:
:return:
"""
h, w = img_height, img_width
if polys.shape[0] == 0:
return polys
polys[:, :, 0] = np.clip(polys[:, :, 0], 0, w - 1)
polys[:, :, 1] = np.clip(polys[:, :, 1], 0, h - 1)
validated_polys = []
validated_tags = []
for poly, tag in zip(polys, tags):
p_area = self.polygon_area(poly)
#invalid poly
if abs(p_area) < 1:
continue
if p_area > 0:
#'poly in wrong direction'
if tag == False:
tag = True #reversed cases should be ignore
poly = poly[(0, 3, 2, 1), :]
validated_polys.append(poly)
validated_tags.append(tag)
return np.array(validated_polys), np.array(validated_tags)
def draw_img_polys(self, img, polys):
    """Debug helper: draw polygons on an image and write it to disk.

    A 4-D input has its leading axis squeezed; an input whose first axis has
    length 3 is transposed with (1, 2, 0) and gets fixed per-channel offsets
    added in place.  The image is round-tripped through ``tmp.jpg`` and the
    annotated result is written to ``tmp_<random 0..100>.jpg``.
    """
    if len(img.shape) == 4:
        img = np.squeeze(img, axis=0)
    if img.shape[0] == 3:
        img = img.transpose((1, 2, 0))
        # Same channel order and constants as the three separate in-place
        # additions.
        for channel, offset in ((2, 123.68), (1, 116.78), (0, 103.94)):
            img[:, :, channel] += offset
    cv2.imwrite("tmp.jpg", img)
    canvas = cv2.imread("tmp.jpg")
    for quad in polys:
        outline = quad.astype(np.int32).reshape((-1, 1, 2))
        cv2.polylines(canvas, [outline], True, color=(255, 255, 0), thickness=2)
    import random
    suffix = random.randint(0, 100)
    cv2.imwrite("tmp_%d.jpg" % suffix, canvas)
def shrink_poly(self, poly, r):
    """
    Move the four vertices of a quadrilateral inwards along its edges.
    Used to generate the score map.
    :param poly: the text poly, indexed poly[i][0] / poly[i][1]; modified
        in place
    :param r: per-vertex reference lengths; vertex i moves by 0.3 * r[i]
        along each of the two edges it is processed with
    :return: the same poly object after shrinking
    """
    # shrink ratio
    R = 0.3

    def _pull(a, b, ux, uy):
        # Vertex a moves along (ux, uy), vertex b moves against it.
        poly[a][0] += R * r[a] * ux
        poly[a][1] += R * r[a] * uy
        poly[b][0] -= R * r[b] * ux
        poly[b][1] -= R * r[b] * uy

    def _shrink_01_32(a, b):
        # Edges (p0, p1) and (p3, p2): angle taken as arctan2(dy, dx).
        theta = np.arctan2((poly[b][1] - poly[a][1]),
                           (poly[b][0] - poly[a][0]))
        _pull(a, b, np.cos(theta), np.sin(theta))

    def _shrink_03_12(a, b):
        # Edges (p0, p3) and (p1, p2): angle taken as arctan2(dx, dy),
        # so sin gives the x component and cos the y component.
        theta = np.arctan2((poly[b][0] - poly[a][0]),
                           (poly[b][1] - poly[a][1]))
        _pull(a, b, np.sin(theta), np.cos(theta))

    # find the longer pair
    len_01 = np.linalg.norm(poly[0] - poly[1])
    len_23 = np.linalg.norm(poly[2] - poly[3])
    len_03 = np.linalg.norm(poly[0] - poly[3])
    len_12 = np.linalg.norm(poly[1] - poly[2])

    first_pair = ((_shrink_01_32, 0, 1), (_shrink_01_32, 3, 2))
    second_pair = ((_shrink_03_12, 0, 3), (_shrink_03_12, 1, 2))
    if len_01 + len_23 > len_03 + len_12:
        # first move (p0, p1), (p2, p3), then (p0, p3), (p1, p2)
        schedule = first_pair + second_pair
    else:
        # first move (p0, p3), (p1, p2), then (p0, p1), (p2, p3)
        schedule = second_pair + first_pair
    for move, a, b in schedule:
        move(a, b)
    return poly
def generate_quad(self, im_size, polys, tags):
    """
    Build the score map, geometry map and training mask for a set of
    quadrilaterals.
    :param im_size: (h, w) of the maps to create
    :param polys: iterable of 4x2 polygons; each is clipped in place to
        [0, w] x [0, h] while its geometry channels are written
    :param tags: per-polygon flags; a truthy tag zeroes the polygon's area
        in the training mask
    :return: (score_map, geo_map, training_mask) with shapes (h, w),
        (h, w, 9) and (h, w)
    """
    h, w = im_size
    poly_mask = np.zeros((h, w), dtype=np.uint8)
    score_map = np.zeros((h, w), dtype=np.uint8)
    # (x1, y1, ..., x4, y4, short_edge_norm)
    geo_map = np.zeros((h, w, 9), dtype=np.float32)
    # mask used during training, to ignore some hard areas
    training_mask = np.ones((h, w), dtype=np.uint8)

    for poly_idx, (poly, tag) in enumerate(zip(polys, tags)):
        label = poly_idx + 1
        # r[i]: the shorter of the two edges meeting at vertex i
        r = [
            min(np.linalg.norm(poly[i] - poly[(i + 1) % 4]),
                np.linalg.norm(poly[i] - poly[(i - 1) % 4]))
            for i in range(4)
        ]
        # score map
        shrinked_poly = self.shrink_poly(
            poly.copy(), r).astype(np.int32)[np.newaxis, :, :]
        cv2.fillPoly(score_map, shrinked_poly, 1)
        cv2.fillPoly(poly_mask, shrinked_poly, label)

        # if the poly is too small, then ignore it during training
        poly_h = min(
            np.linalg.norm(poly[0] - poly[3]),
            np.linalg.norm(poly[1] - poly[2]))
        poly_w = min(
            np.linalg.norm(poly[0] - poly[1]),
            np.linalg.norm(poly[2] - poly[3]))
        short_side = min(poly_h, poly_w)
        if short_side < self.min_text_size:
            cv2.fillPoly(training_mask,
                         poly.astype(np.int32)[np.newaxis, :, :], 0)
        if tag:
            cv2.fillPoly(training_mask,
                         poly.astype(np.int32)[np.newaxis, :, :], 0)

        # geo map: pixels that received this polygon's label
        xy_in_poly = np.argwhere(poly_mask == label)
        y_in_poly = xy_in_poly[:, 0]
        x_in_poly = xy_in_poly[:, 1]
        poly[:, 0] = np.minimum(np.maximum(poly[:, 0], 0), w)
        poly[:, 1] = np.minimum(np.maximum(poly[:, 1], 0), h)
        for pno in range(4):
            x_channel = pno * 2
            geo_map[y_in_poly, x_in_poly, x_channel] = \
                x_in_poly - poly[pno, 0]
            geo_map[y_in_poly, x_in_poly, x_channel + 1] = \
                y_in_poly - poly[pno, 1]
        geo_map[y_in_poly, x_in_poly, 8] = 1.0 / max(short_side, 1.0)
    return score_map, geo_map, training_mask
def crop_area(self,
              im,
              polys,
              tags,
              txts,
              crop_background=False,
              max_tries=50):
    """
    Try to take a random crop whose borders fall outside every polygon's
    x/y extent.
    :param im: image array of shape (h, w, c)
    :param polys: polygons indexed as polys[i, j, (x|y)]
    :param tags: per-polygon flags, indexed with the selected polygon ids
    :param txts: per-polygon texts, indexed with the selected polygon ids
    :param crop_background: when True, a crop containing no polygon is
        returned with empty lists for polys, tags and txts; when False such
        a crop is rejected and another one is tried
    :param max_tries: maximum number of random attempts
    :return: (im, polys, tags, txts); the inputs are returned unchanged if
        no free row/column exists or every attempt is rejected
    """
    h, w, _ = im.shape
    pad_h, pad_w = h // 10, w // 10
    h_array = np.zeros((h + pad_h * 2), dtype=np.int32)
    w_array = np.zeros((w + pad_w * 2), dtype=np.int32)
    for poly in polys:
        quad = np.round(poly, decimals=0).astype(np.int32)
        xs, ys = quad[:, 0], quad[:, 1]
        w_array[np.min(xs) + pad_w:np.max(xs) + pad_w] = 1
        h_array[np.min(ys) + pad_h:np.max(ys) + pad_h] = 1

    # ensure the cropped area not across a text
    h_axis = np.where(h_array == 0)[0]
    w_axis = np.where(w_array == 0)[0]
    if len(h_axis) == 0 or len(w_axis) == 0:
        return im, polys, tags, txts

    for _attempt in range(max_tries):
        xx = np.random.choice(w_axis, size=2)
        xmin = np.clip(np.min(xx) - pad_w, 0, w - 1)
        xmax = np.clip(np.max(xx) - pad_w, 0, w - 1)
        yy = np.random.choice(h_axis, size=2)
        ymin = np.clip(np.min(yy) - pad_h, 0, h - 1)
        ymax = np.clip(np.max(yy) - pad_h, 0, h - 1)
        if xmax - xmin < self.min_crop_side_ratio * w or \
                ymax - ymin < self.min_crop_side_ratio * h:
            # area too small
            continue

        selected_polys = []
        if polys.shape[0] != 0:
            vertex_inside = (polys[:, :, 0] >= xmin) \
                & (polys[:, :, 0] <= xmax) \
                & (polys[:, :, 1] >= ymin) \
                & (polys[:, :, 1] <= ymax)
            # keep polygons with all four vertices inside the crop
            selected_polys = np.where(
                np.sum(vertex_inside, axis=1) == 4)[0]

        if len(selected_polys) == 0:
            # no text in this area
            if not crop_background:
                continue
            return im[ymin:ymax + 1, xmin:xmax + 1, :], [], [], []

        im = im[ymin:ymax + 1, xmin:xmax + 1, :]
        polys = polys[selected_polys]
        tags = tags[selected_polys]
        txts = [txts[idx] for idx in selected_polys]
        polys[:, :, 0] -= xmin
        polys[:, :, 1] -= ymin
        return im, polys, tags, txts
    return im, polys, tags, txts
def crop_background_infor(self, im, text_polys, text_tags, text_strs):
    """
    Produce a text-free training sample.
    Calls crop_area with crop_background=True; if the returned crop still
    has polygons the sample is rejected.
    :return: None on rejection, otherwise (im, score_map, geo_map,
        training_mask) where im comes from self.preprocess, the score and
        geometry maps are all zeros and the training mask is all ones,
        sized by self.input_size
    """
    cropped = self.crop_area(
        im, text_polys, text_tags, text_strs, crop_background=True)
    im, text_polys, text_tags, text_strs = cropped
    if len(text_polys) > 0:
        return None
    # pad and resize image
    side = self.input_size
    im, ratio = self.preprocess(im)
    blank_score = np.zeros((side, side), dtype=np.float32)
    blank_geo = np.zeros((side, side, 9), dtype=np.float32)
    full_mask = np.ones((side, side), dtype=np.float32)
    return im, blank_score, blank_geo, full_mask
def crop_foreground_infor(self, im, text_polys, text_tags, text_strs):
    """
    Produce a training sample that contains text.
    Calls crop_area with crop_background=False, rejects the sample when no
    polygon is left or when every remaining tag is set, then scales the
    polygons by the ratio returned from self.preprocess and builds the
    label maps with self.generate_quad.
    :return: None on rejection, otherwise (im, score_map, geo_map,
        training_mask)
    """
    cropped = self.crop_area(
        im, text_polys, text_tags, text_strs, crop_background=False)
    im, text_polys, text_tags, text_strs = cropped
    if text_polys.shape[0] == 0:
        return None
    # skip the sample when all polygons are flagged as ignored
    if np.sum((text_tags * 1.0)) >= text_tags.size:
        return None
    # pad and resize image
    input_size = self.input_size
    im, ratio = self.preprocess(im)
    for axis in (0, 1):
        text_polys[:, :, axis] *= ratio
    # the preprocessed image is 4-D; its last two axes size the maps
    _, _, new_h, new_w = im.shape
    maps = self.generate_quad((new_h, new_w), text_polys, text_tags)
    score_map, geo_map, training_mask = maps
    return im, score_map, geo_map, training_mask
def __call__(self, label_infor):
    """
    Turn one label record into a training sample.
    Steps: decode the record, load the image, optionally rotate, validate
    the polygons, rescale by a randomly chosen factor, crop a background
    or foreground sample, then subsample the label maps by 4.
    :param label_infor: record passed to self.convert_label_infor
    :return: None if any step rejects the sample, otherwise
        (im, score_map, geo_map, training_mask) with float32 label maps
        laid out channel-first
    """
    im_path, text_polys, text_tags, text_strs = \
        self.convert_label_infor(label_infor)
    im = cv2.imread(im_path)
    if im is None or text_polys.shape[0] == 0:
        return None

    # add rotate cases
    if np.random.rand() < 0.5:
        im, text_polys = self.rotate_im_poly(im, text_polys)
    h, w, _ = im.shape
    text_polys, text_tags = self.check_and_validate_polys(
        text_polys, text_tags, h, w)
    if text_polys.shape[0] == 0:
        return None

    # random scale this image
    rd_scale = np.random.choice(self.random_scale)
    im = cv2.resize(im, dsize=None, fx=rd_scale, fy=rd_scale)
    text_polys *= rd_scale

    if np.random.rand() < self.background_ratio:
        make_sample = self.crop_background_infor
    else:
        make_sample = self.crop_foreground_infor
    outs = make_sample(im, text_polys, text_tags, text_strs)
    if outs is None:
        return None

    im, score_map, geo_map, training_mask = outs
    score_map = score_map[np.newaxis, ::4, ::4].astype(np.float32)
    # (h, w, 9) -> (9, h, w), then keep every 4th row and column
    geo_map = np.transpose(geo_map, (2, 0, 1))
    geo_map = geo_map[:, ::4, ::4].astype(np.float32)
    training_mask = training_mask[np.newaxis, ::4, ::4].astype(np.float32)
    return im, score_map, geo_map, training_mask
class EASTProcessTest(object):
    """
    Test-time preprocessing: resize so both sides are multiples of 32,
    normalise, and convert to a 1 x C x H x W float32 array.
    """

    def __init__(self, params):
        """
        :param params: mapping; optional key 'max_side_len' caps the longer
            image side (default 2400)
        """
        super(EASTProcessTest, self).__init__()
        if 'max_side_len' in params:
            self.max_side_len = params['max_side_len']
        else:
            self.max_side_len = 2400

    @staticmethod
    def _align_side(side):
        """
        Map a side length to the multiple of 32 used for resizing.
        Sides of at least 32 that are already multiples of 32 are kept;
        other sides use the original rule (side // 32 - 1) * 32, except
        that the result is never allowed below 32. The unguarded rule
        produced 0 or a negative size for sides under 64, which
        cv2.resize cannot handle.
        """
        if side >= 32 and side % 32 == 0:
            return side
        if side // 32 <= 1:
            return 32
        return (side // 32 - 1) * 32

    def resize_image(self, im):
        """
        Resize the image so that both sides are multiples of 32 and the
        longer side does not exceed self.max_side_len.
        :param im: input image of shape (h, w, c)
        :return: the resized image and the (ratio_h, ratio_w) resize ratios
        """
        max_side_len = self.max_side_len
        h, w, _ = im.shape

        # limit the max side
        if max(h, w) > max_side_len:
            ratio = float(max_side_len) / (h if h > w else w)
        else:
            ratio = 1.
        resize_h = self._align_side(int(h * ratio))
        resize_w = self._align_side(int(w * ratio))

        im = cv2.resize(im, (int(resize_w), int(resize_h)))
        ratio_h = resize_h / float(h)
        ratio_w = resize_w / float(w)
        return im, (ratio_h, ratio_w)

    def __call__(self, im):
        """
        :param im: image of shape (h, w, 3)
        :return: [array of shape (1, 3, H, W), (ratio_h, ratio_w)]
        """
        im, (ratio_h, ratio_w) = self.resize_image(im)
        img_mean = [0.485, 0.456, 0.406]
        img_std = [0.229, 0.224, 0.225]
        # reverse the channel order, scale to [0, 1], then normalise
        im = im[:, :, ::-1].astype(np.float32)
        im = im / 255
        im -= img_mean
        im /= img_std
        im = im.transpose((2, 0, 1))
        im = im[np.newaxis, :]
        return [im, (ratio_h, ratio_w)]

View File

@ -0,0 +1,147 @@
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import cv2
np.seterr(divide='ignore', invalid='ignore')
import pyclipper
from shapely.geometry import Polygon
import sys
import warnings
warnings.simplefilter("ignore")
def draw_border_map(polygon, canvas, mask, shrink_ratio):
    """
    Render one polygon's border response into `canvas` and mark its dilated
    area in `mask` (both modified in place).

    polygon: sequence of (x, y) points; must convert to an (N, 2) array
    canvas: 2-D map; updated with the element-wise maximum of its current
        values and 1 - normalised distance to the polygon's edges
    mask: 2-D map; the dilated polygon is filled with 1.0
    shrink_ratio: controls the dilation distance
        area * (1 - shrink_ratio ** 2) / perimeter

    Returns None. Nothing is drawn when the polygon's area is not positive.
    """
    polygon = np.array(polygon)
    assert polygon.ndim == 2
    assert polygon.shape[1] == 2

    polygon_shape = Polygon(polygon)
    if polygon_shape.area <= 0:
        return
    distance = polygon_shape.area * (
        1 - np.power(shrink_ratio, 2)) / polygon_shape.length

    # dilate the polygon by `distance`
    offsetter = pyclipper.PyclipperOffset()
    offsetter.AddPath([tuple(pt) for pt in polygon], pyclipper.JT_ROUND,
                      pyclipper.ET_CLOSEDPOLYGON)
    padded_polygon = np.array(offsetter.Execute(distance)[0])
    cv2.fillPoly(mask, [padded_polygon.astype(np.int32)], 1.0)

    # bounding box of the dilated polygon
    padded_x = padded_polygon[:, 0]
    padded_y = padded_polygon[:, 1]
    xmin, xmax = padded_x.min(), padded_x.max()
    ymin, ymax = padded_y.min(), padded_y.max()
    width = xmax - xmin + 1
    height = ymax - ymin + 1

    # move the polygon into the bounding box's local coordinates
    polygon[:, 0] = polygon[:, 0] - xmin
    polygon[:, 1] = polygon[:, 1] - ymin

    xs = np.broadcast_to(
        np.linspace(
            0, width - 1, num=width).reshape(1, width), (height, width))
    ys = np.broadcast_to(
        np.linspace(
            0, height - 1, num=height).reshape(height, 1), (height, width))

    # one normalised distance map per edge, then the minimum over edges
    n_points = polygon.shape[0]
    distance_map = np.zeros((n_points, height, width), dtype=np.float32)
    for start in range(n_points):
        end = (start + 1) % n_points
        edge_distance = _distance(xs, ys, polygon[start], polygon[end])
        distance_map[start] = np.clip(edge_distance / distance, 0, 1)
    distance_map = distance_map.min(axis=0)

    # clamp the bounding box to the canvas and paste the matching window
    last_col = canvas.shape[1] - 1
    last_row = canvas.shape[0] - 1
    xmin_valid = min(max(0, xmin), last_col)
    xmax_valid = min(max(0, xmax), last_col)
    ymin_valid = min(max(0, ymin), last_row)
    ymax_valid = min(max(0, ymax), last_row)
    rows = slice(ymin_valid, ymax_valid + 1)
    cols = slice(xmin_valid, xmax_valid + 1)
    window = distance_map[ymin_valid - ymin:ymax_valid - ymax + height,
                          xmin_valid - xmin:xmax_valid - xmax + width]
    canvas[rows, cols] = np.fmax(1 - window, canvas[rows, cols])
def _distance(xs, ys, point_1, point_2):
'''
compute the distance from point to a line
ys: coordinates in the first axis
xs: coordinates in the second axis
point_1, point_2: (x, y), the end of the line
'''
height, width = xs.shape[:2]
square_distance_1 = np.square(xs - point_1[0]) + np.square(ys - point_1[1])
square_distance_2 = np.square(xs - point_2[0]) + np.square(ys - point_2[1])
square_distance = np.square(point_1[0] - point_2[0]) + np.square(point_1[
1] - point_2[1])
cosin = (square_distance - square_distance_1 - square_distance_2) / (
2 * np.sqrt(square_distance_1 * square_distance_2))
square_sin = 1 - np.square(cosin)
square_sin = np.nan_to_num(square_sin)
result = np.sqrt(square_distance_1 * square_distance_2 * square_sin /
square_distance)
result[cosin <
0] = np.sqrt(np.fmin(square_distance_1, square_distance_2))[cosin <
0]
# self.extend_line(point_1, point_2, result)
return result
def extend_line(point_1, point_2, result, shrink_ratio):
    """
    Draw the two outward extensions of the segment (point_1, point_2) onto
    `result` with value 4096.0 and return the extended end points.

    Each end point is pushed away from the other one by
    (1 + shrink_ratio) times the segment vector and rounded to integers.
    Returns (ex_point_1, ex_point_2) as integer (x, y) tuples.
    """

    def _extend_and_draw(anchor, other):
        # extension starts at `anchor` and points away from `other`
        extended = (
            int(round(anchor[0] + (anchor[0] - other[0]) * (1 + shrink_ratio))),
            int(round(anchor[1] + (anchor[1] - other[1]) * (1 + shrink_ratio))))
        cv2.line(
            result,
            tuple(extended),
            tuple(anchor),
            4096.0,
            1,
            lineType=cv2.LINE_AA,
            shift=0)
        return extended

    ex_point_1 = _extend_and_draw(point_1, point_2)
    ex_point_2 = _extend_and_draw(point_2, point_1)
    return ex_point_1, ex_point_2
def MakeBorderMap(data, shrink_ratio=0.4, thresh_min=0.3, thresh_max=0.7):
    """
    Add the threshold map and its mask to a sample.

    data: dict with keys 'image' (array whose first two dimensions give the
        map size), 'polys' (sequence of polygons) and 'ignore_tags'
        (per-polygon flags; flagged polygons are skipped)
    shrink_ratio: passed to draw_border_map for every polygon
    thresh_min, thresh_max: the border response in [0, 1] is rescaled to
        [thresh_min, thresh_max]

    The defaults reproduce the previously hard-coded values, so existing
    calls of the form MakeBorderMap(data) behave as before.

    Returns the same dict with 'threshold_map' and 'threshold_mask' set
    (both float32 arrays of the image's height and width).
    """
    im = data['image']
    text_polys = data['polys']
    ignore_tags = data['ignore_tags']

    canvas = np.zeros(im.shape[:2], dtype=np.float32)
    mask = np.zeros(im.shape[:2], dtype=np.float32)

    for i in range(len(text_polys)):
        if ignore_tags[i]:
            continue
        draw_border_map(
            text_polys[i], canvas, mask=mask, shrink_ratio=shrink_ratio)
    canvas = canvas * (thresh_max - thresh_min) + thresh_min

    data['threshold_map'] = canvas
    data['threshold_mask'] = mask
    return data

View File

@ -0,0 +1,88 @@
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import cv2
from shapely.geometry import Polygon
import pyclipper
def validate_polygons(polygons, ignore_tags, h, w):
    '''
    Clip polygons to the image, flag tiny ones and normalise vertex order.
    polygons (numpy.array, required): of shape (num_instances, num_points, 2)
    ignore_tags: per-polygon flags, same length as polygons; set to True in
        place for polygons whose absolute area is below 1
    h, w: image height and width; x is clipped to [0, w - 1] and y to
        [0, h - 1]
    Returns (polygons, ignore_tags), both modified in place.
    '''
    if len(polygons) == 0:
        return polygons, ignore_tags
    assert len(polygons) == len(ignore_tags)

    for polygon in polygons:
        polygon[:, 0] = np.clip(polygon[:, 0], 0, w - 1)
        polygon[:, 1] = np.clip(polygon[:, 1], 0, h - 1)

    for idx in range(len(polygons)):
        signed_area = polygon_area(polygons[idx])
        if abs(signed_area) < 1:
            ignore_tags[idx] = True
        if signed_area > 0:
            # positive signed area: reverse the vertex order
            polygons[idx] = polygons[idx][::-1, :]
    return polygons, ignore_tags
def polygon_area(polygon):
    """
    Signed area of a polygon using the trapezoid form of the shoelace sum.

    polygon: array of shape (num_points, 2) holding (x, y) vertices
    Returns half the sum of (x_next - x_cur) * (y_next + y_cur) over all
    edges; the sign depends on the vertex order.

    The original multiplied by (y_next - y_cur), which is not an area: it
    evaluates to 0 for every axis-aligned rectangle.
    """
    edge = 0
    num_points = polygon.shape[0]
    for i in range(num_points):
        next_index = (i + 1) % num_points
        edge += (polygon[next_index, 0] - polygon[i, 0]) * (
            polygon[next_index, 1] + polygon[i, 1])
    return edge / 2.
def MakeShrinkMap(data, min_text_size=8, shrink_ratio=0.4):
    """
    Add the shrunk-text map and its training mask to a sample.

    data: dict with keys 'image', 'polys' and 'ignore_tags'; the polygons
        and tags are validated (and modified in place) by validate_polygons
    min_text_size: polygons whose bounding-box height or width is below
        this value are treated as ignored
    shrink_ratio: each kept polygon is shrunk by
        area * (1 - shrink_ratio ** 2) / perimeter

    The defaults reproduce the previously hard-coded values, so existing
    calls of the form MakeShrinkMap(data) behave as before.

    Returns the same dict with 'shrink_map' (1 inside shrunk polygons) and
    'shrink_mask' (0 inside ignored polygons) set, both float32 of the
    image's height and width.
    """
    image = data['image']
    text_polys = data['polys']
    ignore_tags = data['ignore_tags']

    h, w = image.shape[:2]
    text_polys, ignore_tags = validate_polygons(text_polys, ignore_tags, h, w)
    gt = np.zeros((h, w), dtype=np.float32)
    mask = np.ones((h, w), dtype=np.float32)

    for i in range(len(text_polys)):
        polygon = text_polys[i]
        height = max(polygon[:, 1]) - min(polygon[:, 1])
        width = max(polygon[:, 0]) - min(polygon[:, 0])
        if ignore_tags[i] or min(height, width) < min_text_size:
            # ignored or too small: exclude the region from training
            cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
            ignore_tags[i] = True
            continue

        polygon_shape = Polygon(polygon)
        distance = polygon_shape.area * (
            1 - np.power(shrink_ratio, 2)) / polygon_shape.length
        subject = [tuple(l) for l in text_polys[i]]
        padding = pyclipper.PyclipperOffset()
        padding.AddPath(subject, pyclipper.JT_ROUND,
                        pyclipper.ET_CLOSEDPOLYGON)
        shrinked = padding.Execute(-distance)
        if shrinked == []:
            # shrinking removed the polygon entirely: treat it as ignored
            cv2.fillPoly(mask, polygon.astype(np.int32)[np.newaxis, :, :], 0)
            ignore_tags[i] = True
            continue
        shrinked = np.array(shrinked[0]).reshape(-1, 2)
        cv2.fillPoly(gt, [shrinked.astype(np.int32)], 1)

    data['shrink_map'] = gt
    data['shrink_mask'] = mask
    return data

View File

@ -0,0 +1,155 @@
# -*- coding:utf-8 -*-
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import numpy as np
import cv2
import random
def is_poly_in_rect(poly, x, y, w, h):
    """
    Return True when every point of `poly` lies within the rectangle
    [x, x + w] x [y, y + h] (borders included), False otherwise.
    """
    points = np.array(poly)
    xs, ys = points[:, 0], points[:, 1]
    if xs.min() < x or xs.max() > x + w:
        return False
    return not (ys.min() < y or ys.max() > y + h)
def is_poly_outside_rect(poly, x, y, w, h):
    """
    Return True when the bounding box of `poly` does not overlap the
    rectangle [x, x + w] x [y, y + h], i.e. the polygon lies entirely to
    one side of it; False otherwise.
    """
    points = np.array(poly)
    xs, ys = points[:, 0], points[:, 1]
    if xs.max() < x or xs.min() > x + w:
        return True
    return bool(ys.max() < y or ys.min() > y + h)
def split_regions(axis):
    """
    Split a 1-D array of coordinates into runs of consecutive values.

    axis: 1-D numpy array of increasing integer coordinates
    Returns a list of sub-arrays of `axis`, one per run, in order. An empty
    input gives an empty list.

    The original never appended the final run, so the last free region
    could not be picked by region-wise selection.
    """
    regions = []
    min_axis = 0
    for i in range(1, axis.shape[0]):
        if axis[i] != axis[i - 1] + 1:
            regions.append(axis[min_axis:i])
            min_axis = i
    if axis.shape[0] > 0:
        # close the run that extends to the end of the array
        regions.append(axis[min_axis:])
    return regions
def random_select(axis, max_size):
    """
    Draw two values from `axis` (with replacement) and return them as
    (smaller, larger), each clipped to [0, max_size - 1].
    """
    picks = np.random.choice(axis, size=2)
    upper = max_size - 1
    lower_pick = np.clip(np.min(picks), 0, upper)
    upper_pick = np.clip(np.max(picks), 0, upper)
    return lower_pick, upper_pick
def region_wise_random_select(regions, max_size):
    """
    Pick two regions at random (with replacement), draw one coordinate from
    each, and return the pair as (smaller, larger) Python ints.

    regions: list of 1-D arrays of coordinates
    max_size: accepted for signature compatibility; not used
    """
    selected_index = list(np.random.choice(len(regions), 2))
    selected_values = []
    for index in selected_index:
        axis = regions[index]
        # Take the element out of the 1-element result before converting:
        # int() on an array with ndim > 0 is deprecated in NumPy.
        xx = int(np.random.choice(axis, size=1)[0])
        selected_values.append(xx)
    xmin = min(selected_values)
    xmax = max(selected_values)
    return xmin, xmax
def crop_area(im, text_polys, min_crop_side_ratio, max_tries):
    """
    Pick a random crop rectangle whose borders lie on rows/columns not
    covered by any polygon's extent.

    im: image array of shape (h, w, c)
    text_polys: iterable of polygons, each an array of (x, y) points
    min_crop_side_ratio: a candidate is rejected when its width (height) is
        below this fraction of the image width (height)
    max_tries: number of random candidates to try

    Returns (x, y, crop_w, crop_h). The first candidate that is large
    enough and not entirely outside every polygon is returned; otherwise
    the full image (0, 0, w, h).
    """
    h, w, _ = im.shape
    rows_taken = np.zeros(h, dtype=np.int32)
    cols_taken = np.zeros(w, dtype=np.int32)
    for points in text_polys:
        pts = np.round(points, decimals=0).astype(np.int32)
        cols_taken[np.min(pts[:, 0]):np.max(pts[:, 0])] = 1
        rows_taken[np.min(pts[:, 1]):np.max(pts[:, 1])] = 1

    # ensure the cropped area not across a text
    h_axis = np.where(rows_taken == 0)[0]
    w_axis = np.where(cols_taken == 0)[0]
    if len(h_axis) == 0 or len(w_axis) == 0:
        return 0, 0, w, h

    h_regions = split_regions(h_axis)
    w_regions = split_regions(w_axis)

    for _attempt in range(max_tries):
        # x is drawn before y, as in the original
        if len(w_regions) > 1:
            xmin, xmax = region_wise_random_select(w_regions, w)
        else:
            xmin, xmax = random_select(w_axis, w)
        if len(h_regions) > 1:
            ymin, ymax = region_wise_random_select(h_regions, h)
        else:
            ymin, ymax = random_select(h_axis, h)

        if xmax - xmin < min_crop_side_ratio * w or ymax - ymin < min_crop_side_ratio * h:
            # area too small
            continue

        touches_text = any(
            not is_poly_outside_rect(poly, xmin, ymin, xmax - xmin,
                                     ymax - ymin) for poly in text_polys)
        if touches_text:
            return xmin, ymin, xmax - xmin, ymax - ymin

    return 0, 0, w, h
def RandomCropData(data, size):
    """
    Randomly crop a sample and fit the crop into a fixed-size canvas.

    data: dict with keys 'image', 'polys', 'ignore_tags' and 'texts'
    size: target size; size[0] is used as width and size[1] as height

    The crop is scaled by a single factor so that it fits inside the
    target and is placed at the top-left of a zero canvas. Polygons are
    moved into crop coordinates and those lying entirely outside the
    scaled crop are dropped together with their tag and text.

    Returns the same dict with 'image', 'polys', 'ignore_tags' and 'texts'
    replaced.
    """
    max_tries = 10
    min_crop_side_ratio = 0.1
    require_original_image = False
    keep_ratio = True

    im = data['image']
    text_polys = data['polys']
    ignore_tags = data['ignore_tags']
    texts = data['texts']
    # only polygons that are not ignored steer the crop
    all_care_polys = [
        text_polys[i] for i, tag in enumerate(ignore_tags) if not tag
    ]

    # compute the crop region
    crop_x, crop_y, crop_w, crop_h = crop_area(im, all_care_polys,
                                               min_crop_side_ratio, max_tries)

    # crop the image; keep the aspect ratio and pad the remainder
    scale = min(size[0] / crop_w, size[1] / crop_h)
    h = int(crop_h * scale)
    w = int(crop_w * scale)
    cropped = im[crop_y:crop_y + crop_h, crop_x:crop_x + crop_w]
    if keep_ratio:
        img = np.zeros((size[1], size[0], im.shape[2]), im.dtype)
        img[:h, :w] = cv2.resize(cropped, (w, h))
    else:
        img = cv2.resize(cropped, tuple(size))

    # crop the text boxes
    text_polys_crop = []
    ignore_tags_crop = []
    texts_crop = []
    for poly, text, tag in zip(text_polys, texts, ignore_tags):
        moved = ((poly - (crop_x, crop_y)) * scale).tolist()
        if is_poly_outside_rect(moved, 0, 0, w, h):
            continue
        text_polys_crop.append(moved)
        ignore_tags_crop.append(tag)
        texts_crop.append(text)

    data['image'] = img
    data['polys'] = np.array(text_polys_crop)
    data['ignore_tags'] = ignore_tags_crop
    data['texts'] = texts_crop
    return data

81
ppocr/data/reader_main.py Executable file
View File

@ -0,0 +1,81 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import random
import numpy as np
import paddle
from ppocr.utils.utility import create_module
from copy import deepcopy
from .rec.img_tools import process_image
import cv2
import sys
import signal
# handle terminate reader process, do not print stack frame
def _reader_quit(signum, frame):
print("Reader process exit.")
sys.exit()
def _term_group(sig_num, frame):
    """
    Signal handler: report the current pid and process group, then send
    SIGKILL to the whole process group of the current process.
    """
    current_pid = os.getpid()
    print('pid {} terminated, terminate group '
          '{}...'.format(current_pid, os.getpgrp()))
    group_id = os.getpgid(os.getpid())
    os.killpg(group_id, signal.SIGKILL)
# SIGTERM: handled by _reader_quit (prints a message and calls sys.exit()).
signal.signal(signal.SIGTERM, _reader_quit)
# SIGINT: handled by _term_group (sends SIGKILL to the process group).
signal.signal(signal.SIGINT, _term_group)
def reader_main(config=None, mode=None):
    """Create a reader for the given mode.
    Args:
        config: mapping with a 'Global' section and the reader section for
            the mode ('TrainReader', 'EvalReader' or 'TestReader'); the
            reader section must name its 'reader_function'
        mode: one of "train", "eval", "test"
    Returns:
        For "train": the num_workers per-process readers combined by
        paddle.reader.multiprocess_reader. Otherwise the reader obtained
        by calling the reader object with the mode.
    """
    assert mode in ["train", "eval", "test"],\
        "Nonsupport mode:{}".format(mode)
    global_params = config['Global']
    section_by_mode = {"train": 'TrainReader', "eval": 'EvalReader'}
    params = deepcopy(config[section_by_mode.get(mode, 'TestReader')])
    params['mode'] = mode
    # global settings override reader-section keys of the same name
    params.update(global_params)

    function = create_module(params['reader_function'])(params)
    if mode != "train":
        return function(mode)
    readers = [function(process_id)
               for process_id in range(params['num_workers'])]
    return paddle.reader.multiprocess_reader(readers, False)
def test_reader(image_shape, img_path):
    """
    Read the image at `img_path` with cv2.imread and return the result of
    process_image(img, image_shape).
    """
    return process_image(cv2.imread(img_path), image_shape)

13
ppocr/data/rec/__init__.py Executable file
View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,201 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import os
import math
import random
import numpy as np
import cv2
import string
import lmdb
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from .img_tools import process_image, get_img_data
class LMDBReader(object):
    """
    Batch reader over every LMDB dataset found under a root directory.
    Calling an instance with a process id returns a generator function
    that yields lists of processed samples.
    """

    def __init__(self, params):
        """
        :param params: mapping with keys 'mode', 'lmdb_sets_dir',
            'char_ops', 'image_shape', 'loss_type', 'max_text_length',
            plus 'num_workers' and 'train_batch_size_per_card' in train
            mode or 'test_batch_size_per_card' otherwise
        """
        is_train = params['mode'] == 'train'
        # outside training a single worker reads everything
        self.num_workers = params['num_workers'] if is_train else 1
        self.lmdb_sets_dir = params['lmdb_sets_dir']
        self.char_ops = params['char_ops']
        self.image_shape = params['image_shape']
        self.loss_type = params['loss_type']
        self.max_text_length = params['max_text_length']
        self.mode = params['mode']
        if is_train:
            self.batch_size = params['train_batch_size_per_card']
        else:
            self.batch_size = params['test_batch_size_per_card']

    def load_hierarchical_lmdb_dataset(self):
        """
        Open every leaf directory under self.lmdb_sets_dir as an LMDB
        environment.
        :return: dict mapping a running index to a dict with keys
            'dirpath', 'env', 'txn' and 'num_samples'
        """
        lmdb_sets = {}
        for dirpath, dirnames, filenames in os.walk(self.lmdb_sets_dir + '/'):
            if dirnames:
                # only directories without sub-directories are opened
                continue
            env = lmdb.open(
                dirpath,
                max_readers=32,
                readonly=True,
                lock=False,
                readahead=False,
                meminit=False)
            txn = env.begin(write=False)
            num_samples = int(txn.get('num-samples'.encode()))
            lmdb_sets[len(lmdb_sets)] = {"dirpath": dirpath, "env": env,
                                         "txn": txn, "num_samples": num_samples}
        return lmdb_sets

    def print_lmdb_sets_info(self, lmdb_sets):
        """Log one 'path:count' entry per opened dataset."""
        summary = ''.join(
            " %s:%d," % (lmdb_sets[idx]['dirpath'],
                         lmdb_sets[idx]['num_samples'])
            for idx in range(len(lmdb_sets)))
        logger.info("DataSummary:" + summary)
        return

    def close_lmdb_dataset(self, lmdb_sets):
        """Close the environment of every opened dataset."""
        for entry in lmdb_sets.values():
            entry['env'].close()
        return

    def get_lmdb_sample_info(self, txn, index):
        """
        Fetch sample `index` from a transaction.
        :return: (img, label), or None when the label is missing or the
            image cannot be decoded
        """
        label = txn.get('label-%09d'.encode() % index)
        if label is None:
            return None
        label = label.decode('utf-8')
        imgbuf = txn.get('image-%09d'.encode() % index)
        img = get_img_data(imgbuf)
        if img is None:
            return None
        return img, label

    def __call__(self, process_id):
        """
        :param process_id: worker index; forced to 0 outside train mode
        :return: generator function yielding lists of at most
            self.batch_size processed samples
        """
        if self.mode != 'train':
            process_id = 0

        def sample_iter_reader():
            lmdb_sets = self.load_hierarchical_lmdb_dataset()
            if process_id == 0:
                self.print_lmdb_sets_info(lmdb_sets)
            n_sets = len(lmdb_sets)
            # this worker starts at 1 + process_id and strides by
            # num_workers within every dataset
            cur_index_sets = [1 + process_id] * n_sets
            while True:
                finish_read_num = 0
                for dataset_idx in range(n_sets):
                    dataset = lmdb_sets[dataset_idx]
                    cur_index = cur_index_sets[dataset_idx]
                    if cur_index > dataset['num_samples']:
                        finish_read_num += 1
                        continue
                    sample_info = self.get_lmdb_sample_info(
                        dataset['txn'], cur_index)
                    cur_index_sets[dataset_idx] += self.num_workers
                    if sample_info is None:
                        continue
                    img, label = sample_info
                    outs = process_image(img, self.image_shape, label,
                                         self.char_ops, self.loss_type,
                                         self.max_text_length)
                    if outs is None:
                        continue
                    yield outs

                if finish_read_num == n_sets:
                    break
            self.close_lmdb_dataset(lmdb_sets)

        def batch_iter_reader():
            pending = []
            for outs in sample_iter_reader():
                pending.append(outs)
                if len(pending) == self.batch_size:
                    yield pending
                    pending = []
            # emit the final, possibly shorter batch
            if len(pending) != 0:
                yield pending

        return batch_iter_reader
class SimpleReader(object):
    """
    Batch reader over a tab-separated label file ("<image path>\\t<label>"
    per line). In test mode it reads the single image named by
    params['infer_img'] instead.
    """

    def __init__(self, params):
        """
        :param params: mapping with keys 'mode', 'img_set_dir',
            'label_file_path', 'char_ops', 'image_shape', 'loss_type',
            'max_text_length', plus 'num_workers' and
            'train_batch_size_per_card' (train), 'test_batch_size_per_card'
            (eval) or 'infer_img' (any other mode)
        """
        if params['mode'] != 'train':
            self.num_workers = 1
        else:
            self.num_workers = params['num_workers']
        self.img_set_dir = params['img_set_dir']
        self.label_file_path = params['label_file_path']
        self.char_ops = params['char_ops']
        self.image_shape = params['image_shape']
        self.loss_type = params['loss_type']
        self.max_text_length = params['max_text_length']
        self.mode = params['mode']
        if params['mode'] == 'train':
            self.batch_size = params['train_batch_size_per_card']
        elif params['mode'] == 'eval':
            self.batch_size = params['test_batch_size_per_card']
        else:
            self.batch_size = 1
            self.infer_img = params['infer_img']

    def __call__(self, process_id):
        """
        :param process_id: worker index; forced to 0 outside train mode
        :return: generator function yielding lists of at most
            self.batch_size processed samples
        """
        if self.mode != 'train':
            process_id = 0

        def sample_iter_reader():
            if self.mode == 'test':
                print("infer_img:", self.infer_img)
                img = cv2.imread(self.infer_img)
                norm_img = process_image(img, self.image_shape)
                yield norm_img
                # Test mode serves only the inference image. The original
                # fell through and went on to read the label file as well.
                return
            with open(self.label_file_path, "rb") as fin:
                label_infor_list = fin.readlines()
            img_num = len(label_infor_list)
            img_id_list = list(range(img_num))
            random.shuffle(img_id_list)
            # this worker takes every num_workers-th shuffled position
            for img_id in range(process_id, img_num, self.num_workers):
                label_infor = label_infor_list[img_id_list[img_id]]
                substr = label_infor.decode('utf-8').strip("\n").split("\t")
                img_path = self.img_set_dir + "/" + substr[0]
                img = cv2.imread(img_path)
                if img is None:
                    # unreadable image: skip the sample
                    continue
                label = substr[1]
                outs = process_image(img, self.image_shape, label,
                                     self.char_ops, self.loss_type,
                                     self.max_text_length)
                if outs is None:
                    continue
                yield outs

        def batch_iter_reader():
            batch_outs = []
            for outs in sample_iter_reader():
                batch_outs.append(outs)
                if len(batch_outs) == self.batch_size:
                    yield batch_outs
                    batch_outs = []
            # emit the final, possibly shorter batch
            if len(batch_outs) != 0:
                yield batch_outs

        return batch_iter_reader

92
ppocr/data/rec/img_tools.py Executable file
View File

@ -0,0 +1,92 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
import math
import cv2
import numpy as np
def get_bounding_box_rect(pos):
    """
    Return [min(pos[0]), min(pos[1]), max(pos[0]), max(pos[1])], named
    left, top, right and bottom in the code.
    """
    first, second = pos[0], pos[1]
    left, right = min(first), max(first)
    top, bottom = min(second), max(second)
    return [left, top, right, bottom]
def resize_norm_img(img, image_shape):
    """
    Resize an image to the target height, normalise it to [-1, 1] and
    right-pad it with zeros to the target width.

    img: image array; img.shape[0] is its height, img.shape[1] its width
    image_shape: (imgC, imgH, imgW) of the output
    Returns a float32 array of shape (imgC, imgH, imgW).
    """
    imgC, imgH, imgW = image_shape
    src_h, src_w = img.shape[0], img.shape[1]
    ratio = src_w / float(src_h)
    # keep the aspect ratio, but never exceed the target width
    scaled_w = math.ceil(imgH * ratio)
    resized_w = imgW if scaled_w > imgW else int(scaled_w)

    resized_image = cv2.resize(img, (resized_w, imgH)).astype('float32')
    if image_shape[0] == 1:
        # single channel: add the channel axis in front
        resized_image = (resized_image / 255)[np.newaxis, :]
    else:
        # move the channel axis to the front
        resized_image = resized_image.transpose((2, 0, 1)) / 255
    resized_image -= 0.5
    resized_image /= 0.5

    padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
    padding_im[:, :, 0:resized_w] = resized_image
    return padding_im
def get_img_data(value):
    """
    Decode an encoded image buffer with cv2.imdecode (flag 1).
    Returns None for an empty/None buffer or when decoding yields None.
    """
    if not value:
        return None
    raw = np.frombuffer(value, dtype='uint8')
    if raw is None:
        return None
    # cv2.imdecode's result is passed through, including None
    return cv2.imdecode(raw, 1)
def process_image(img,
                  image_shape,
                  label=None,
                  char_ops=None,
                  loss_type=None,
                  max_text_length=None):
    """
    Normalise an image and, when a label is given, encode the label for
    the selected loss.

    Returns:
        - without a label: the normalised image with a leading axis added
        - label that encodes to nothing or to more than max_text_length
          entries: None
        - loss_type "ctc": (norm_img, text) with text reshaped to (-1, 1)
        - loss_type "attention": (norm_img, beg_text, end_text), the text
          with the "beg" flag index prepended / the "end" flag index
          appended, each reshaped to (-1, 1)
        - any other loss_type: AssertionError
    """
    norm_img = resize_norm_img(img, image_shape)[np.newaxis, :]
    if label is None:
        return (norm_img)

    char_num = char_ops.get_char_num()
    text = char_ops.encode(label)
    text_len = len(text)
    if text_len == 0 or text_len > max_text_length:
        return None

    if loss_type == "ctc":
        return (norm_img, text.reshape(-1, 1))
    if loss_type == "attention":
        beg_flag_idx = char_ops.get_beg_end_flag_idx("beg")
        end_flag_idx = char_ops.get_beg_end_flag_idx("end")
        beg_text = np.append(beg_flag_idx, text).reshape(-1, 1)
        end_text = np.append(text, end_flag_idx).reshape(-1, 1)
        return (norm_img, beg_text, end_text)
    assert False, "Unsupport loss_type %s in process_image"\
        % loss_type
    return (norm_img)

13
ppocr/modeling/__init__.py Executable file
View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,119 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from copy import deepcopy
class DetModel(object):
    """Text detection network assembled from a configurable backbone, head and loss."""

    def __init__(self, params):
        """
        Detection module for OCR text detection.
        args:
            params (dict): the super parameters for detection module. It must
                provide the sections 'Global', 'Backbone', 'Head' and 'Loss';
                the last three each carry a 'function' entry that is resolved
                with create_module.
        """
        global_params = params['Global']
        self.algorithm = global_params['algorithm']
        self.backbone = self._build_module(params["Backbone"], global_params)
        self.head = self._build_module(params["Head"], global_params)
        self.loss = self._build_module(params["Loss"], global_params)
        self.image_shape = global_params['image_shape']

    @staticmethod
    def _build_module(section_params, global_params):
        """Instantiate one sub-module from its config section merged with the global settings."""
        # Work on a copy so the caller's configuration section is not modified.
        merged = deepcopy(section_params)
        merged.update(global_params)
        return create_module(merged['function'])(params=merged)

    def create_feed(self, mode):
        """
        create Dataloader feeds
        args:
            mode (str): 'train' for training or else for evaluation
        return: (image, corresponding label, dataloader); label and
            dataloader are None unless mode is 'train'
        raises:
            NotImplementedError: in 'train' mode when self.algorithm is
                neither "EAST" nor "DB"
        """
        image_shape = deepcopy(self.image_shape)
        image = fluid.layers.data(
            name='image', shape=image_shape, dtype='float32')
        if mode != "train":
            return image, None, None

        if self.algorithm == "EAST":
            score = fluid.layers.data(
                name='score', shape=[1, 128, 128], dtype='float32')
            geo = fluid.layers.data(
                name='geo', shape=[9, 128, 128], dtype='float32')
            mask = fluid.layers.data(
                name='mask', shape=[1, 128, 128], dtype='float32')
            feed_list = [image, score, geo, mask]
            labels = {'score': score, 'geo': geo, 'mask': mask}
        elif self.algorithm == "DB":
            # All DB label maps use image_shape without its first entry.
            map_shape = image_shape[1:]
            shrink_map = fluid.layers.data(
                name='shrink_map', shape=map_shape, dtype='float32')
            shrink_mask = fluid.layers.data(
                name='shrink_mask', shape=map_shape, dtype='float32')
            threshold_map = fluid.layers.data(
                name='threshold_map', shape=map_shape, dtype='float32')
            threshold_mask = fluid.layers.data(
                name='threshold_mask', shape=map_shape, dtype='float32')
            feed_list = [
                image, shrink_map, shrink_mask, threshold_map, threshold_mask
            ]
            labels = {
                'shrink_map': shrink_map,
                'shrink_mask': shrink_mask,
                'threshold_map': threshold_map,
                'threshold_mask': threshold_mask
            }
        else:
            # Previously this fell through and failed later with an
            # UnboundLocalError on feed_list; fail early with a clear message.
            raise NotImplementedError(
                "Unsupported detection algorithm %s in create_feed" %
                self.algorithm)

        loader = fluid.io.DataLoader.from_generator(
            feed_list=feed_list,
            capacity=64,
            use_double_buffer=True,
            iterable=False)
        return image, labels, loader

    def __call__(self, mode):
        """
        run forward of defined module
        args:
            mode (str): 'train' for training; 'export' for inference,
                others for evaluation
        return: (loader, losses) for 'train', [image, predicts] for
            'export', (loader, predicts) otherwise
        """
        image, labels, loader = self.create_feed(mode)
        conv_feas = self.backbone(image)
        predicts = self.head(conv_feas)
        if mode == "train":
            losses = self.loss(predicts, labels)
            return loader, losses
        elif mode == "export":
            return [image, predicts]
        else:
            return loader, predicts

View File

@ -0,0 +1,114 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from copy import deepcopy
class RecModel(object):
    """Text recognition network: optional TPS stage, backbone, head and loss."""

    def __init__(self, params):
        """Build all sub-modules from the configuration.

        Args:
            params (dict): must provide 'Global', 'Backbone', 'Head' and
                'Loss' sections and may provide 'TPS'. Each module section
                carries a 'function' entry resolved with create_module.
                Note that 'char_num' is written into params['Global'].
        """
        super(RecModel, self).__init__()
        global_params = params['Global']
        global_params['char_num'] = global_params['char_ops'].get_char_num()

        def build(section):
            # Merge the section with the global settings on a private copy.
            merged = deepcopy(params[section])
            merged.update(global_params)
            return create_module(merged['function'])(params=merged)

        self.tps = build("TPS") if "TPS" in params else None
        self.backbone = build("Backbone")
        self.head = build("Head")
        self.loss = build("Loss")
        self.loss_type = global_params['loss_type']
        self.image_shape = global_params['image_shape']
        self.max_text_length = global_params['max_text_length']

    def create_feed(self, mode):
        """Declare the input variables and, for training, the data loader.

        Args:
            mode (str): 'train' creates label inputs and a DataLoader;
                any other value creates only the image input.

        Returns:
            (image, labels, loader); labels and loader are None unless
            mode is 'train'.
        """
        image_shape = deepcopy(self.image_shape)
        image_shape.insert(0, -1)
        image = fluid.data(name='image', shape=image_shape, dtype='float32')
        if mode != "train":
            return image, None, None

        def label_var(var_name):
            return fluid.data(
                name=var_name, shape=[None, 1], dtype='int32', lod_level=1)

        if self.loss_type == "attention":
            label_in = label_var('label_in')
            label_out = label_var('label_out')
            feed_list = [image, label_in, label_out]
            labels = {'label_in': label_in, 'label_out': label_out}
        else:
            label = label_var('label')
            feed_list = [image, label]
            labels = {'label': label}

        loader = fluid.io.DataLoader.from_generator(
            feed_list=feed_list,
            capacity=64,
            use_double_buffer=True,
            iterable=False)
        return image, labels, loader

    def __call__(self, mode):
        """Build the forward graph.

        Args:
            mode (str): 'train', 'export', or anything else for evaluation.

        Returns:
            - 'train': (loader, {'total_loss', 'decoded_out', 'label'})
            - 'export': [image, {'decoded_out'}]
            - otherwise: (loader, {'decoded_out'})
        """
        image, labels, loader = self.create_feed(mode)
        inputs = image if self.tps is None else self.tps(image)
        features = self.backbone(inputs)
        predicts = self.head(features, labels, mode)
        decoded_out = predicts['decoded_out']

        if mode == "export":
            return [image, {'decoded_out': decoded_out}]
        if mode != "train":
            return loader, {'decoded_out': decoded_out}

        loss = self.loss(predicts, labels)
        label_key = 'label_out' if self.loss_type == "attention" else 'label'
        outputs = {
            'total_loss': loss,
            'decoded_out': decoded_out,
            'label': labels[label_key]
        }
        return loader, outputs

View File

@ -0,0 +1,251 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
__all__ = ['MobileNetV3']
class MobileNetV3():
    """MobileNetV3 backbone that returns a list of intermediate feature maps."""

    def __init__(self, params):
        """
        the MobilenetV3 backbone network for detection module.
        Args:
            params(dict): the super parameters for build network; reads
                'scale' (channel multiplier) and 'model_name'
                ("large" or "small")
        """
        self.scale = params['scale']
        model_name = params['model_name']
        self.inplanes = 16
        if model_name == "large":
            self.cfg = [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, False, 'relu', 1],
                [3, 64, 24, False, 'relu', 2],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', 2],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hard_swish', 2],
                [3, 200, 80, False, 'hard_swish', 1],
                [3, 184, 80, False, 'hard_swish', 1],
                [3, 184, 80, False, 'hard_swish', 1],
                [3, 480, 112, True, 'hard_swish', 1],
                [3, 672, 112, True, 'hard_swish', 1],
                [5, 672, 160, True, 'hard_swish', 2],
                [5, 960, 160, True, 'hard_swish', 1],
                [5, 960, 160, True, 'hard_swish', 1],
            ]
            self.cls_ch_squeeze = 960
            self.cls_ch_expand = 1280
        elif model_name == "small":
            self.cfg = [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, True, 'relu', 2],
                [3, 72, 24, False, 'relu', 2],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hard_swish', 2],
                [5, 240, 40, True, 'hard_swish', 1],
                [5, 240, 40, True, 'hard_swish', 1],
                [5, 120, 48, True, 'hard_swish', 1],
                [5, 144, 48, True, 'hard_swish', 1],
                [5, 288, 96, True, 'hard_swish', 2],
                [5, 576, 96, True, 'hard_swish', 1],
                [5, 576, 96, True, 'hard_swish', 1],
            ]
            self.cls_ch_squeeze = 576
            self.cls_ch_expand = 1280
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        assert self.scale in supported_scale, \
            "supported scale are {} but input scale is {}".format(supported_scale, self.scale)

    def __call__(self, input):
        """Build the backbone graph on `input`.

        Returns:
            list: the feature map entering every stride-2 unit whose index
            is greater than 2, followed by the output of the final 1x1
            'conv_last' layer.
        """
        scale = self.scale
        stem_channels = self.make_divisible(self.inplanes * scale)
        #conv1
        feat = self.conv_bn_layer(
            input,
            filter_size=3,
            num_filters=stem_channels,
            stride=2,
            padding=1,
            num_groups=1,
            if_act=True,
            act='hard_swish',
            name='conv1')

        in_channels = stem_channels
        outs = []
        for idx, (kernel, expand, channels, use_se, act,
                  stride) in enumerate(self.cfg):
            # Keep the feature map just before each later down-sampling unit.
            if stride == 2 and idx > 2:
                outs.append(feat)
            out_channels = self.make_divisible(scale * channels)
            feat = self.residual_unit(
                input=feat,
                num_in_filter=in_channels,
                num_mid_filter=self.make_divisible(scale * expand),
                num_out_filter=out_channels,
                act=act,
                stride=stride,
                filter_size=kernel,
                use_se=use_se,
                name='conv' + str(idx + 2))
            in_channels = out_channels

        feat = self.conv_bn_layer(
            input=feat,
            filter_size=1,
            num_filters=self.make_divisible(scale * self.cls_ch_squeeze),
            stride=1,
            padding=0,
            num_groups=1,
            if_act=True,
            act='hard_swish',
            name='conv_last')
        outs.append(feat)
        return outs

    def conv_bn_layer(self,
                      input,
                      filter_size,
                      num_filters,
                      stride,
                      padding,
                      num_groups=1,
                      if_act=True,
                      act=None,
                      name=None,
                      use_cudnn=True,
                      res_last_bn_init=False):
        """Bias-free conv2d followed by batch norm and an optional activation.

        Only 'relu' and 'hard_swish' are applied; any other `act` value
        leaves the batch-norm output unchanged. `res_last_bn_init` is
        accepted but not used.
        """
        conv_out = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            act=None,
            use_cudnn=use_cudnn,
            param_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)
        bn_prefix = name + '_bn'
        out = fluid.layers.batch_norm(
            input=conv_out,
            param_attr=ParamAttr(
                name=bn_prefix + "_scale",
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=0.0)),
            bias_attr=ParamAttr(
                name=bn_prefix + "_offset",
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=0.0)),
            moving_mean_name=bn_prefix + '_mean',
            moving_variance_name=bn_prefix + '_variance')
        if not if_act:
            return out
        if act == 'relu':
            return fluid.layers.relu(out)
        if act == 'hard_swish':
            return fluid.layers.hard_swish(out)
        return out

    def make_divisible(self, v, divisor=8, min_value=None):
        """Round `v` to a multiple of `divisor`, at least `min_value`.

        `min_value` defaults to `divisor`. If rounding would fall below
        90% of `v`, one more `divisor` is added.
        """
        lower_bound = divisor if min_value is None else min_value
        rounded = max(lower_bound, int(v + divisor / 2) // divisor * divisor)
        if rounded < 0.9 * v:
            rounded += divisor
        return rounded

    def se_block(self, input, num_out_filter, ratio=4, name=None):
        """Squeeze-and-excitation: global average pool, two 1x1 convs, rescale `input`."""
        squeezed_channels = num_out_filter // ratio
        pooled = fluid.layers.pool2d(
            input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
        squeeze = fluid.layers.conv2d(
            input=pooled,
            filter_size=1,
            num_filters=squeezed_channels,
            act='relu',
            param_attr=ParamAttr(name=name + '_1_weights'),
            bias_attr=ParamAttr(name=name + '_1_offset'))
        excite = fluid.layers.conv2d(
            input=squeeze,
            filter_size=1,
            num_filters=num_out_filter,
            act='hard_sigmoid',
            param_attr=ParamAttr(name=name + '_2_weights'),
            bias_attr=ParamAttr(name=name + '_2_offset'))
        return fluid.layers.elementwise_mul(x=input, y=excite, axis=0)

    def residual_unit(self,
                      input,
                      num_in_filter,
                      num_mid_filter,
                      num_out_filter,
                      stride,
                      filter_size,
                      act=None,
                      use_se=False,
                      name=None):
        """Expand (1x1) -> depthwise conv -> optional SE -> linear 1x1 projection.

        The input is added back only when the channel count is unchanged
        and stride is 1.
        """
        expanded = self.conv_bn_layer(
            input=input,
            filter_size=1,
            num_filters=num_mid_filter,
            stride=1,
            padding=0,
            if_act=True,
            act=act,
            name=name + '_expand')
        # groups == channels makes this a depthwise convolution.
        depthwise = self.conv_bn_layer(
            input=expanded,
            filter_size=filter_size,
            num_filters=num_mid_filter,
            stride=stride,
            padding=int((filter_size - 1) // 2),
            if_act=True,
            act=act,
            num_groups=num_mid_filter,
            use_cudnn=False,
            name=name + '_depthwise')
        if use_se:
            depthwise = self.se_block(
                input=depthwise,
                num_out_filter=num_mid_filter,
                name=name + '_se')
        projected = self.conv_bn_layer(
            input=depthwise,
            filter_size=1,
            num_filters=num_out_filter,
            stride=1,
            padding=0,
            if_act=False,
            name=name + '_linear',
            res_last_bn_init=True)
        if num_in_filter == num_out_filter and stride == 1:
            return fluid.layers.elementwise_add(
                x=input, y=projected, act=None)
        return projected

View File

@ -0,0 +1,252 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
__all__ = ["ResNet"]
class ResNet(object):
    """ResNet backbone that returns the output of each of its four stages."""

    def __init__(self, params):
        """
        the Resnet backbone network for detection module.
        Args:
            params(dict): the super parameters for network build; reads
                'layers', which must be one of 18, 34, 50, 101, 152
        """
        self.layers = params['layers']
        supported_layers = [18, 34, 50, 101, 152]
        assert self.layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, self.layers)
        # True selects the three stacked 3x3 convolutions as the stem;
        # False selects a single 7x7 convolution.
        self.is_3x3 = True

    def __call__(self, input):
        """Build the backbone graph on `input`.

        Returns:
            list: four feature maps, one appended after each stage.
        """
        layers = self.layers
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_filters = [64, 128, 256, 512]
        outs = []

        if not self.is_3x3:
            # The layer name is mandatory: conv_bn_layer builds parameter
            # names from it, and "conv1" is the name its batch-norm naming
            # already special-cases. Omitting it raised a TypeError.
            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=2,
                act='relu',
                name='conv1')
        else:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=32,
                filter_size=3,
                stride=2,
                act='relu',
                name='conv1_1')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=32,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_2')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=64,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_3')

        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')

        use_bottleneck = layers >= 50
        block_fn = self.bottleneck_block if use_bottleneck else self.basic_block
        for block in range(len(depth)):
            for i in range(depth[block]):
                if layers in [101, 152, 200] and block == 2:
                    # The long third stage is named "a", "b1", "b2", ...
                    suffix = "a" if i == 0 else "b" + str(i)
                else:
                    suffix = chr(97 + i)
                conv = block_fn(
                    input=conv,
                    num_filters=num_filters[block],
                    # Down-sample in the first unit of every stage but the first.
                    stride=2 if i == 0 and block != 0 else 1,
                    if_first=block == i == 0,
                    name="res" + str(block + 2) + suffix)
            outs.append(conv)
        return outs

    def _batch_norm(self, conv, name, act):
        """Apply batch norm with parameter names derived from the conv layer name."""
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            # Replace the first three characters of the layer name by "bn".
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(name=bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv2d with (filter_size - 1) // 2 padding, then batch norm.

        `name` is required in practice: parameter names are built from it.
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        return self._batch_norm(conv, name, act)

    def conv_bn_layer_new(self,
                          input,
                          num_filters,
                          filter_size,
                          stride=1,
                          groups=1,
                          act=None,
                          name=None):
        """2x2 average pool (stride 2), then a stride-1 conv and batch norm.

        The `stride` argument is accepted but not used: down-sampling is
        done by the fixed 2x2 pooling.
        """
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='avg',
            ceil_mode=True)
        conv = fluid.layers.conv2d(
            input=pool,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=1,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        return self._batch_norm(conv, name, act)

    def shortcut(self, input, ch_out, stride, name, if_first=False):
        """Return the residual branch: identity, or a 1x1 projection when needed.

        The very first unit (`if_first`) always uses a plain strided 1x1
        conv; other units whose shape changes use the pooled variant.
        """
        ch_in = input.shape[1]
        if if_first:
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        if ch_in != ch_out or stride != 1:
            return self.conv_bn_layer_new(input, ch_out, 1, stride, name=name)
        return input

    def bottleneck_block(self, input, num_filters, stride, name, if_first):
        """1x1 -> 3x3 (strided) -> 1x1 (4x channels) plus shortcut, then ReLU."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")
        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

    def basic_block(self, input, num_filters, stride, name, if_first):
        """3x3 (strided) -> 3x3 plus shortcut, then ReLU."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b")
        short = self.shortcut(
            input,
            num_filters,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')

View File

@ -0,0 +1,255 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from paddle.fluid.initializer import MSRA
from paddle.fluid.param_attr import ParamAttr
# Only names that this module actually defines may be exported; listing
# undefined names makes `from <module> import *` fail with AttributeError.
__all__ = ['MobileNetV3']


class MobileNetV3():
    """MobileNetV3 backbone returning a single max-pooled feature map.

    Down-sampling units use stride (2, 1), i.e. a different stride per
    spatial axis.
    """

    def __init__(self, params):
        """
        Args:
            params(dict): the super parameters for build network; reads
                'scale' (channel multiplier) and 'model_name'
                ("large" or "small")
        Raises:
            NotImplementedError: for any other model_name
            AssertionError: when scale is not a supported value
        """
        self.scale = params['scale']
        model_name = params['model_name']
        self.inplanes = 16
        if model_name == "large":
            self.cfg = [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, False, 'relu', 1],
                [3, 64, 24, False, 'relu', (2, 1)],
                [3, 72, 24, False, 'relu', 1],
                [5, 72, 40, True, 'relu', (2, 1)],
                [5, 120, 40, True, 'relu', 1],
                [5, 120, 40, True, 'relu', 1],
                [3, 240, 80, False, 'hard_swish', 1],
                [3, 200, 80, False, 'hard_swish', 1],
                [3, 184, 80, False, 'hard_swish', 1],
                [3, 184, 80, False, 'hard_swish', 1],
                [3, 480, 112, True, 'hard_swish', 1],
                [3, 672, 112, True, 'hard_swish', 1],
                [5, 672, 160, True, 'hard_swish', (2, 1)],
                [5, 960, 160, True, 'hard_swish', 1],
                [5, 960, 160, True, 'hard_swish', 1],
            ]
            self.cls_ch_squeeze = 960
            self.cls_ch_expand = 1280
        elif model_name == "small":
            self.cfg = [
                # k, exp, c,  se,     nl,  s,
                [3, 16, 16, True, 'relu', (2, 1)],
                [3, 72, 24, False, 'relu', (2, 1)],
                [3, 88, 24, False, 'relu', 1],
                [5, 96, 40, True, 'hard_swish', (2, 1)],
                [5, 240, 40, True, 'hard_swish', 1],
                [5, 240, 40, True, 'hard_swish', 1],
                [5, 120, 48, True, 'hard_swish', 1],
                [5, 144, 48, True, 'hard_swish', 1],
                [5, 288, 96, True, 'hard_swish', (2, 1)],
                [5, 576, 96, True, 'hard_swish', 1],
                [5, 576, 96, True, 'hard_swish', 1],
            ]
            self.cls_ch_squeeze = 576
            self.cls_ch_expand = 1280
        else:
            raise NotImplementedError("mode[" + model_name +
                                      "_model] is not implemented!")

        supported_scale = [0.35, 0.5, 0.75, 1.0, 1.25]
        # The message must use self.scale: a bare `scale` is undefined here
        # and turned the intended AssertionError into a NameError.
        assert self.scale in supported_scale, \
            "supported scale are {} but input scale is {}".format(supported_scale, self.scale)

    def __call__(self, input):
        """Build the backbone graph on `input` and return the final feature map."""
        scale = self.scale
        inplanes = self.make_divisible(self.inplanes * scale)
        #conv1
        conv = self.conv_bn_layer(
            input,
            filter_size=3,
            num_filters=inplanes,
            stride=2,
            padding=1,
            num_groups=1,
            if_act=True,
            act='hard_swish',
            name='conv1')

        for i, layer_cfg in enumerate(self.cfg):
            num_out_filter = self.make_divisible(scale * layer_cfg[2])
            conv = self.residual_unit(
                input=conv,
                num_in_filter=inplanes,
                num_mid_filter=self.make_divisible(scale * layer_cfg[1]),
                num_out_filter=num_out_filter,
                act=layer_cfg[4],
                stride=layer_cfg[5],
                filter_size=layer_cfg[0],
                use_se=layer_cfg[3],
                name='conv' + str(i + 2))
            inplanes = num_out_filter

        conv = self.conv_bn_layer(
            input=conv,
            filter_size=1,
            num_filters=self.make_divisible(scale * self.cls_ch_squeeze),
            stride=1,
            padding=0,
            num_groups=1,
            if_act=True,
            act='hard_swish',
            name='conv_last')

        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='max')
        return conv

    def conv_bn_layer(self,
                      input,
                      filter_size,
                      num_filters,
                      stride,
                      padding,
                      num_groups=1,
                      if_act=True,
                      act=None,
                      name=None,
                      use_cudnn=True,
                      res_last_bn_init=False):
        """Bias-free conv2d followed by batch norm and an optional activation.

        Only 'relu' and 'hard_swish' are applied; any other `act` value
        leaves the batch-norm output unchanged. `res_last_bn_init` is
        accepted but not used.
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=padding,
            groups=num_groups,
            act=None,
            use_cudnn=use_cudnn,
            param_attr=ParamAttr(name=name + '_weights'),
            bias_attr=False)
        bn_name = name + '_bn'
        bn = fluid.layers.batch_norm(
            input=conv,
            param_attr=ParamAttr(
                name=bn_name + "_scale",
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=0.0)),
            bias_attr=ParamAttr(
                name=bn_name + "_offset",
                regularizer=fluid.regularizer.L2DecayRegularizer(
                    regularization_coeff=0.0)),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')
        if if_act:
            if act == 'relu':
                bn = fluid.layers.relu(bn)
            elif act == 'hard_swish':
                bn = fluid.layers.hard_swish(bn)
        return bn

    def make_divisible(self, v, divisor=8, min_value=None):
        """Round `v` to a multiple of `divisor`, at least `min_value`.

        `min_value` defaults to `divisor`. If rounding would fall below
        90% of `v`, one more `divisor` is added.
        """
        if min_value is None:
            min_value = divisor
        new_v = max(min_value, int(v + divisor / 2) // divisor * divisor)
        if new_v < 0.9 * v:
            new_v += divisor
        return new_v

    def se_block(self, input, num_out_filter, ratio=4, name=None):
        """Squeeze-and-excitation: global average pool, two 1x1 convs, rescale `input`."""
        num_mid_filter = num_out_filter // ratio
        pool = fluid.layers.pool2d(
            input=input, pool_type='avg', global_pooling=True, use_cudnn=False)
        conv1 = fluid.layers.conv2d(
            input=pool,
            filter_size=1,
            num_filters=num_mid_filter,
            act='relu',
            param_attr=ParamAttr(name=name + '_1_weights'),
            bias_attr=ParamAttr(name=name + '_1_offset'))
        conv2 = fluid.layers.conv2d(
            input=conv1,
            filter_size=1,
            num_filters=num_out_filter,
            act='hard_sigmoid',
            param_attr=ParamAttr(name=name + '_2_weights'),
            bias_attr=ParamAttr(name=name + '_2_offset'))
        return fluid.layers.elementwise_mul(x=input, y=conv2, axis=0)

    def residual_unit(self,
                      input,
                      num_in_filter,
                      num_mid_filter,
                      num_out_filter,
                      stride,
                      filter_size,
                      act=None,
                      use_se=False,
                      name=None):
        """Expand (1x1) -> depthwise conv -> optional SE -> linear 1x1 projection.

        The input is added back only when the channel count is unchanged
        and stride equals 1.
        """
        conv0 = self.conv_bn_layer(
            input=input,
            filter_size=1,
            num_filters=num_mid_filter,
            stride=1,
            padding=0,
            if_act=True,
            act=act,
            name=name + '_expand')
        # groups == channels makes this a depthwise convolution.
        conv1 = self.conv_bn_layer(
            input=conv0,
            filter_size=filter_size,
            num_filters=num_mid_filter,
            stride=stride,
            padding=int((filter_size - 1) // 2),
            if_act=True,
            act=act,
            num_groups=num_mid_filter,
            use_cudnn=False,
            name=name + '_depthwise')
        if use_se:
            conv1 = self.se_block(
                input=conv1, num_out_filter=num_mid_filter, name=name + '_se')
        conv2 = self.conv_bn_layer(
            input=conv1,
            filter_size=1,
            num_filters=num_out_filter,
            stride=1,
            padding=0,
            if_act=False,
            name=name + '_linear',
            res_last_bn_init=True)
        # A tuple stride such as (2, 1) compares unequal to 1, so
        # down-sampling units never take the residual path.
        if num_in_filter != num_out_filter or stride != 1:
            return conv2
        return fluid.layers.elementwise_add(x=input, y=conv2, act=None)

View File

@ -0,0 +1,271 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
# Only names that this module actually defines may be exported; listing
# undefined names makes `from <module> import *` fail with AttributeError.
__all__ = ["ResNet"]


class ResNet():
    """ResNet backbone returning a single max-pooled feature map.

    Stage transitions use stride (2, 1), i.e. a different stride per
    spatial axis.
    """

    def __init__(self, params):
        """
        Args:
            params(dict): the super parameters for network build; reads
                'layers', which must be one of 18, 34, 50, 101, 152, 200
        """
        self.layers = params['layers']
        # True selects the three stacked 3x3 convolutions as the stem;
        # False selects a single 7x7 convolution.
        self.is_3x3 = True
        supported_layers = [18, 34, 50, 101, 152, 200]
        assert self.layers in supported_layers, \
            "supported layers are {} but input layer is {}".format(supported_layers, self.layers)

    def __call__(self, input):
        """Build the backbone graph on `input` and return the final feature map."""
        layers = self.layers
        if layers == 18:
            depth = [2, 2, 2, 2]
        elif layers == 34 or layers == 50:
            depth = [3, 4, 6, 3]
        elif layers == 101:
            depth = [3, 4, 23, 3]
        elif layers == 152:
            depth = [3, 8, 36, 3]
        elif layers == 200:
            depth = [3, 12, 48, 3]
        num_filters = [64, 128, 256, 512]

        if not self.is_3x3:
            # The layer name is mandatory: conv_bn_layer builds parameter
            # names from it, and "conv1" is the name its batch-norm naming
            # already special-cases. Omitting it raised a TypeError.
            conv = self.conv_bn_layer(
                input=input,
                num_filters=64,
                filter_size=7,
                stride=1,
                act='relu',
                name='conv1')
        else:
            conv = self.conv_bn_layer(
                input=input,
                num_filters=32,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_1')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=32,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_2')
            conv = self.conv_bn_layer(
                input=conv,
                num_filters=64,
                filter_size=3,
                stride=1,
                act='relu',
                name='conv1_3')

        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=3,
            pool_stride=2,
            pool_padding=1,
            pool_type='max')

        use_bottleneck = layers >= 50
        block_fn = self.bottleneck_block if use_bottleneck else self.basic_block
        for block in range(len(depth)):
            for i in range(depth[block]):
                if layers in [101, 152, 200] and block == 2:
                    # The long third stage is named "a", "b1", "b2", ...
                    suffix = "a" if i == 0 else "b" + str(i)
                else:
                    suffix = chr(97 + i)
                # First unit of every stage but the first uses stride (2, 1).
                stride = (2, 1) if i == 0 and block != 0 else (1, 1)
                conv = block_fn(
                    input=conv,
                    num_filters=num_filters[block],
                    stride=stride,
                    if_first=block == i == 0,
                    name="res" + str(block + 2) + suffix)

        conv = fluid.layers.pool2d(
            input=conv,
            pool_size=2,
            pool_stride=2,
            pool_padding=0,
            pool_type='max')
        return conv

    def _batch_norm(self, conv, name, act):
        """Apply batch norm with parameter names derived from the conv layer name."""
        if name == "conv1":
            bn_name = "bn_" + name
        else:
            # Replace the first three characters of the layer name by "bn".
            bn_name = "bn" + name[3:]
        return fluid.layers.batch_norm(
            input=conv,
            act=act,
            param_attr=ParamAttr(name=bn_name + '_scale'),
            bias_attr=ParamAttr(name=bn_name + '_offset'),
            moving_mean_name=bn_name + '_mean',
            moving_variance_name=bn_name + '_variance')

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv2d with (filter_size - 1) // 2 padding, then batch norm.

        `name` is required in practice: parameter names are built from it.
        """
        conv = fluid.layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        return self._batch_norm(conv, name, act)

    def conv_bn_layer_new(self,
                          input,
                          num_filters,
                          filter_size,
                          stride=1,
                          groups=1,
                          act=None,
                          name=None):
        """Average pool with size and stride `stride`, then a stride-1 conv and batch norm."""
        pool = fluid.layers.pool2d(
            input=input,
            pool_size=stride,
            pool_stride=stride,
            pool_padding=0,
            pool_type='avg',
            ceil_mode=True)
        conv = fluid.layers.conv2d(
            input=pool,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=1,
            padding=(filter_size - 1) // 2,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        return self._batch_norm(conv, name, act)

    def shortcut(self, input, ch_out, stride, name, if_first=False):
        """Return the residual branch: identity, or a 1x1 projection when needed.

        `stride` must be indexable; only stride[0] decides whether the
        shape changes. The very first unit (`if_first`) always uses a
        plain 1x1 conv; other shape-changing units use the pooled variant.
        """
        ch_in = input.shape[1]
        if if_first:
            return self.conv_bn_layer(input, ch_out, 1, stride, name=name)
        if ch_in != ch_out or stride[0] != 1:
            return self.conv_bn_layer_new(input, ch_out, 1, stride, name=name)
        return input

    def bottleneck_block(self, input, num_filters, stride, name, if_first):
        """1x1 -> 3x3 (strided) -> 1x1 (4x channels) plus shortcut, then ReLU."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=1,
            act='relu',
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            stride=stride,
            act='relu',
            name=name + "_branch2b")
        conv2 = self.conv_bn_layer(
            input=conv1,
            num_filters=num_filters * 4,
            filter_size=1,
            act=None,
            name=name + "_branch2c")
        short = self.shortcut(
            input,
            num_filters * 4,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv2, act='relu')

    def basic_block(self, input, num_filters, stride, name, if_first):
        """3x3 (strided) -> 3x3 plus shortcut, then ReLU."""
        conv0 = self.conv_bn_layer(
            input=input,
            num_filters=num_filters,
            filter_size=3,
            act='relu',
            stride=stride,
            name=name + "_branch2a")
        conv1 = self.conv_bn_layer(
            input=conv0,
            num_filters=num_filters,
            filter_size=3,
            act=None,
            name=name + "_branch2b")
        short = self.shortcut(
            input,
            num_filters,
            stride,
            if_first=if_first,
            name=name + "_branch1")
        return fluid.layers.elementwise_add(x=short, y=conv1, act='relu')

View File

@ -0,0 +1,95 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import math
def get_para_bias_attr(l2_decay, k, name):
    """
    Create the weight and bias ParamAttr of a layer.
    Both attributes share one L2Decay regularizer and one Uniform
    initializer drawn from [-1/sqrt(k), 1/sqrt(k)].
    args:
        l2_decay (float): coefficient handed to fluid.regularizer.L2Decay.
        k (int|float): value used to scale the initializer range.
        name (str): prefix of the attribute names; "_w_attr" and "_b_attr"
            are appended.
    return: (list) [weight ParamAttr, bias ParamAttr]
    """
    l2_reg = fluid.regularizer.L2Decay(l2_decay)
    bound = 1.0 / math.sqrt(k * 1.0)
    uniform_init = fluid.initializer.Uniform(-bound, bound)
    shared = dict(regularizer=l2_reg, initializer=uniform_init)
    weight_attr = fluid.ParamAttr(name=name + "_w_attr", **shared)
    offset_attr = fluid.ParamAttr(name=name + "_b_attr", **shared)
    return [weight_attr, offset_attr]
def conv_bn_layer(input,
                  num_filters,
                  filter_size,
                  stride=1,
                  groups=1,
                  act=None,
                  name=None):
    """
    Bias-free conv2d followed by batch_norm.
    args:
        input (variable): input feature map.
        num_filters (int): number of convolution filters.
        filter_size (int): kernel size; the padding is (filter_size - 1) // 2.
        stride (int): convolution stride, default is 1.
        groups (int): convolution groups, default is 1.
        act (str|None): activation applied by batch_norm, default is None.
        name (str): prefix of every parameter / layer name. Although the
            default is None, a string is required because it is
            concatenated with the name suffixes.
    return: (variable) output of the batch_norm layer.
    """
    pad = (filter_size - 1) // 2
    conv_out = fluid.layers.conv2d(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=pad,
        groups=groups,
        act=None,
        param_attr=ParamAttr(name=name + "_weights"),
        bias_attr=False,
        name=name + '.conv2d')
    norm_prefix = "bn_" + name
    return fluid.layers.batch_norm(
        input=conv_out,
        act=act,
        name=norm_prefix + '.output',
        param_attr=ParamAttr(name=norm_prefix + '_scale'),
        bias_attr=ParamAttr(name=norm_prefix + '_offset'),
        moving_mean_name=norm_prefix + '_mean',
        moving_variance_name=norm_prefix + '_variance')
def deconv_bn_layer(input,
                    num_filters,
                    filter_size=4,
                    stride=2,
                    act='relu',
                    name=None):
    """
    Bias-free conv2d_transpose (padding fixed to 1) followed by batch_norm.
    args:
        input (variable): input feature map.
        num_filters (int): number of deconvolution filters.
        filter_size (int): kernel size, default is 4.
        stride (int): deconvolution stride, default is 2.
        act (str|None): activation applied by batch_norm, default is 'relu'.
        name (str): prefix of every parameter / layer name. Although the
            default is None, a string is required because it is
            concatenated with the name suffixes.
    return: (variable) output of the batch_norm layer.
    """
    deconv_out = fluid.layers.conv2d_transpose(
        input=input,
        num_filters=num_filters,
        filter_size=filter_size,
        stride=stride,
        padding=1,
        act=None,
        param_attr=ParamAttr(name=name + "_weights"),
        bias_attr=False,
        name=name + '.deconv2d')
    norm_prefix = "bn_" + name
    return fluid.layers.batch_norm(
        input=deconv_out,
        act=act,
        name=norm_prefix + '.output',
        param_attr=ParamAttr(name=norm_prefix + '_scale'),
        bias_attr=ParamAttr(name=norm_prefix + '_offset'),
        moving_mean_name=norm_prefix + '_mean',
        moving_variance_name=norm_prefix + '_variance')
def create_tmp_var(program, name, dtype, shape, lod_level=0):
    """
    Create a variable in the current block of program.
    args:
        program: object exposing current_block().
        name (str): name of the new variable.
        dtype: data type of the new variable.
        shape (list|tuple): shape of the new variable.
        lod_level (int): lod level of the new variable, default is 0.
    return: the value returned by the block's create_var().
    """
    block = program.current_block()
    return block.create_var(
        name=name, dtype=dtype, shape=shape, lod_level=lod_level)

View File

@ -0,0 +1,206 @@
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
class DBHead(object):
    """
    Differentiable Binarization (DB) for text detection:
        see https://arxiv.org/abs/1911.08947
    args:
        params(dict): super parameters for build DB network; the keys 'k',
            'inner_channels' and 'image_shape' are read.
    """

    def __init__(self, params):
        self.k = params['k']
        self.inner_channels = params['inner_channels']
        self.C, self.H, self.W = params['image_shape']

    def _prediction_branch(self, x):
        """
        Shared implementation of binarize() and thresh():
        3x3 conv -> bn+relu -> 2x2 deconv (stride 2) -> bn+relu
        -> 2x2 deconv (stride 2, 1 filter) -> sigmoid.
        args:
            x (variable): fused feature map.
        return: (variable) single-channel map after sigmoid.
        """
        quarter_channels = self.inner_channels // 4
        conv1 = fluid.layers.conv2d(
            input=x,
            num_filters=quarter_channels,
            filter_size=3,
            padding=1,
            param_attr=fluid.initializer.MSRAInitializer(uniform=False),
            bias_attr=False)
        conv_bn1 = fluid.layers.batch_norm(
            input=conv1,
            param_attr=fluid.initializer.ConstantInitializer(value=1.0),
            bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
            act="relu")
        # NOTE(review): the bias names "conv2"/"conv3" are identical for the
        # binarize and the thresh branch, so both branches presumably share
        # these bias parameters -- kept as is for checkpoint compatibility,
        # confirm whether the sharing is intended.
        conv2 = fluid.layers.conv2d_transpose(
            input=conv_bn1,
            num_filters=quarter_channels,
            filter_size=2,
            stride=2,
            param_attr=fluid.initializer.MSRAInitializer(uniform=False),
            bias_attr=self._get_bias_attr(0.0004, conv_bn1.shape[1], "conv2"),
            act=None)
        conv_bn2 = fluid.layers.batch_norm(
            input=conv2,
            param_attr=fluid.initializer.ConstantInitializer(value=1.0),
            bias_attr=fluid.initializer.ConstantInitializer(value=1e-4),
            act="relu")
        conv3 = fluid.layers.conv2d_transpose(
            input=conv_bn2,
            num_filters=1,
            filter_size=2,
            stride=2,
            param_attr=fluid.initializer.MSRAInitializer(uniform=False),
            bias_attr=self._get_bias_attr(0.0004, conv_bn2.shape[1], "conv3"),
            act=None)
        return fluid.layers.sigmoid(conv3)

    def binarize(self, x):
        """
        Predict the shrink (probability) map from the fused features.
        args:
            x (variable): fused feature map.
        return: (variable) single-channel map after sigmoid.
        """
        return self._prediction_branch(x)

    def thresh(self, x):
        """
        Predict the threshold map from the fused features.
        args:
            x (variable): fused feature map.
        return: (variable) single-channel map after sigmoid.
        """
        return self._prediction_branch(x)

    def _get_bias_attr(self, l2_decay, k, name, gradient_clip=None):
        """
        Create a bias ParamAttr with L2 decay and a Uniform initializer in
        [-1/sqrt(k), 1/sqrt(k)].
        args:
            l2_decay (float): coefficient of the L2Decay regularizer.
            k (int|float): value used to scale the initializer range.
            name (str): prefix of the attribute name, "_b_attr" is appended.
            gradient_clip: forwarded to fluid.ParamAttr, default is None.
        return: (ParamAttr) the bias attribute.
        """
        regularizer = fluid.regularizer.L2Decay(l2_decay)
        stdv = 1.0 / math.sqrt(k * 1.0)
        initializer = fluid.initializer.Uniform(-stdv, stdv)
        return fluid.ParamAttr(
            regularizer=regularizer,
            gradient_clip=gradient_clip,
            initializer=initializer,
            name=name + "_b_attr")

    def step_function(self, x, y):
        """
        Approximate binarization: 1 / (1 + exp(-k * (x - y))).
        args:
            x (variable): shrink map.
            y (variable): threshold map.
        return: (variable) approximate binary map.
        """
        return fluid.layers.reciprocal(1 + fluid.layers.exp(-self.k * (x - y)))

    def __call__(self, conv_features, mode="train"):
        """
        Build the DB head on top of four backbone feature maps.
        args:
            conv_features (list|tuple): the four feature maps (c2, c3, c4, c5).
            mode (str): "train" builds all three maps; any other value stops
                after the shrink map. Default is "train".
        return: the shrink map variable when mode != "train", otherwise a
            dict whose 'maps' entry is the channel-wise concatenation of the
            shrink, threshold and binary maps.
        """
        c2, c3, c4, c5 = conv_features
        # a single initializer object is reused by every convolution below
        param_attr = fluid.initializer.MSRAInitializer(uniform=False)

        def lateral(feature):
            # 1x1 conv bringing a backbone feature to inner_channels
            return fluid.layers.conv2d(
                input=feature,
                num_filters=self.inner_channels,
                filter_size=1,
                param_attr=param_attr,
                bias_attr=False)

        def smooth(feature):
            # 3x3 conv reducing a pyramid level to inner_channels // 4
            return fluid.layers.conv2d(
                input=feature,
                num_filters=self.inner_channels // 4,
                filter_size=3,
                padding=1,
                param_attr=param_attr,
                bias_attr=False)

        # Layers are created in the original order (in5, in4, in3, in2, ...)
        # because unnamed parameters receive sequential auto-generated names.
        in5 = lateral(c5)
        in4 = lateral(c4)
        in3 = lateral(c3)
        in2 = lateral(c2)

        out4 = fluid.layers.elementwise_add(
            x=fluid.layers.resize_nearest(
                input=in5, scale=2), y=in4)  # 1/16
        out3 = fluid.layers.elementwise_add(
            x=fluid.layers.resize_nearest(
                input=out4, scale=2), y=in3)  # 1/8
        out2 = fluid.layers.elementwise_add(
            x=fluid.layers.resize_nearest(
                input=out3, scale=2), y=in2)  # 1/4

        p5 = fluid.layers.resize_nearest(input=smooth(in5), scale=8)
        p4 = fluid.layers.resize_nearest(input=smooth(out4), scale=4)
        p3 = fluid.layers.resize_nearest(input=smooth(out3), scale=2)
        p2 = smooth(out2)

        fuse = fluid.layers.concat(input=[p5, p4, p3, p2], axis=1)
        shrink_maps = self.binarize(fuse)
        if mode != "train":
            return shrink_maps
        threshold_maps = self.thresh(fuse)
        binary_maps = self.step_function(shrink_maps, threshold_maps)
        y = fluid.layers.concat(
            input=[shrink_maps, threshold_maps, binary_maps], axis=1)
        predicts = {}
        predicts['maps'] = y
        return predicts

View File

@ -0,0 +1,116 @@
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
from ..common_functions import conv_bn_layer, deconv_bn_layer
class EASTHead(object):
    """
    EAST: An Efficient and Accurate Scene Text Detector
        see arxiv: https://arxiv.org/abs/1704.03155
    args:
        params(dict): the super parameters for network build; the key
            'model_name' is read ("large" selects the wider layers).
    """

    def __init__(self, params):
        self.model_name = params['model_name']

    def unet_fusion(self, inputs):
        """
        Merge four backbone feature maps in a U-Net like fashion.
        args:
            inputs (list): feature maps; they are processed in reversed
                order (last element first).
        return: (variable) the fused feature map.
        """
        features = inputs[::-1]
        width = 128 if self.model_name == "large" else 64
        num_outputs = [width, width, width, width]
        merged = None  # output of the previous stage
        for i in range(4):
            if i == 0:
                stage_in = features[i]
            else:
                stage_in = fluid.layers.concat([merged, features[i]], axis=1)
            stage_out = conv_bn_layer(
                input=stage_in,
                num_filters=num_outputs[i],
                filter_size=3,
                stride=1,
                act='relu',
                name="unet_h_%d" % (i))
            if i <= 2:
                #can be replaced with unpool
                merged = deconv_bn_layer(
                    input=stage_out,
                    num_filters=num_outputs[i],
                    name="unet_g_%d" % (i))
            else:
                # last stage: plain conv instead of deconv
                merged = conv_bn_layer(
                    input=stage_out,
                    num_filters=num_outputs[i],
                    filter_size=3,
                    stride=1,
                    act='relu',
                    name="unet_g_%d" % (i))
        return merged

    def detector_header(self, f_common):
        """
        Predict the score map and the geometry map from the fused features.
        args:
            f_common (variable): fused feature map.
        return: (tuple) f_score (1 channel, after sigmoid) and f_geo
            (8 channels, (sigmoid - 0.5) * 2 * 800).
        """
        if self.model_name == "large":
            num_outputs = [128, 64, 1, 8]
        else:
            num_outputs = [64, 32, 1, 8]
        head_width1, head_width2, score_channels, geo_channels = num_outputs

        f_det = conv_bn_layer(
            input=f_common,
            num_filters=head_width1,
            filter_size=3,
            stride=1,
            act='relu',
            name="det_head1")
        f_det = conv_bn_layer(
            input=f_det,
            num_filters=head_width2,
            filter_size=3,
            stride=1,
            act='relu',
            name="det_head2")
        #f_score
        score_logits = conv_bn_layer(
            input=f_det,
            num_filters=score_channels,
            filter_size=1,
            stride=1,
            act=None,
            name="f_score")
        f_score = fluid.layers.sigmoid(score_logits)
        #f_geo
        geo_logits = conv_bn_layer(
            input=f_det,
            num_filters=geo_channels,
            filter_size=1,
            stride=1,
            act=None,
            name="f_geo")
        # maps the sigmoid output from (0, 1) to (-800, 800)
        f_geo = (fluid.layers.sigmoid(geo_logits) - 0.5) * 2 * 800
        return f_score, f_geo

    def __call__(self, inputs):
        """
        Build the EAST head.
        args:
            inputs (list): backbone feature maps.
        return: (dict) with the keys 'f_score' and 'f_geo'.
        """
        f_common = self.unet_fusion(inputs)
        f_score, f_geo = self.detector_header(f_common)
        return {'f_score': f_score, 'f_geo': f_geo}

View File

@ -0,0 +1,232 @@
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from .rec_seq_encoder import SequenceEncoder
import numpy as np
class AttentionPredict(object):
    """
    Attention based recognition head built on top of SequenceEncoder.
    args:
        params(dict): super parameters; the keys 'char_num', 'Attention'
            ('decoder_size' and 'word_vector_dim'), 'encoder_type' and
            'max_text_length' are read, and the dict is also handed to
            SequenceEncoder.
    """

    def __init__(self, params):
        super(AttentionPredict, self).__init__()
        self.char_num = params['char_num']
        self.encoder = SequenceEncoder(params)
        self.decoder_size = params['Attention']['decoder_size']
        self.word_vector_dim = params['Attention']['word_vector_dim']
        self.encoder_type = params['encoder_type']
        self.max_length = params['max_text_length']

    def simple_attention(self, encoder_vec, encoder_proj, decoder_state,
                         decoder_size):
        """
        Compute the attention context for one decoding step.
        args:
            encoder_vec (variable): encoded sequence to be pooled.
            encoder_proj (variable): projection of the encoded sequence.
            decoder_state (variable): current decoder hidden state.
            decoder_size (int): size of the decoder state projection.
        return: (variable) sum of encoder_vec weighted by the attention.
        """
        state_proj = layers.fc(input=decoder_state,
                               size=decoder_size,
                               bias_attr=False,
                               name="decoder_state_proj_fc")
        # repeat the state along the sequence described by encoder_proj
        state_expanded = layers.sequence_expand(x=state_proj, y=encoder_proj)
        energy = layers.elementwise_add(encoder_proj, state_expanded)
        energy = layers.tanh(x=energy)
        attention_weights = layers.fc(input=energy,
                                      size=1,
                                      act=None,
                                      bias_attr=False,
                                      name="attention_weights_fc")
        attention_weights = layers.sequence_softmax(input=attention_weights)
        flat_weights = layers.reshape(x=attention_weights, shape=[-1])
        weighted = layers.elementwise_mul(
            x=encoder_vec, y=flat_weights, axis=0)
        return layers.sequence_pool(input=weighted, pool_type='sum')

    def gru_decoder_with_attention(self, target_embedding, encoder_vec,
                                   encoder_proj, decoder_boot, decoder_size,
                                   char_num):
        """
        Training decoder: a DynamicRNN with a GRU cell fed by the attention
        context and the embedded target character.
        args:
            target_embedding (variable): embedded input labels.
            encoder_vec (variable): encoded sequence.
            encoder_proj (variable): projection of the encoded sequence.
            decoder_boot (variable): initial decoder hidden state.
            decoder_size (int): hidden size of the GRU.
            char_num (int): size of the softmax output.
        return: (variable) output of the DynamicRNN.
        """
        gate_size = decoder_size * 3
        rnn = layers.DynamicRNN()
        with rnn.block():
            current_word = rnn.step_input(target_embedding)
            encoder_vec = rnn.static_input(encoder_vec)
            encoder_proj = rnn.static_input(encoder_proj)
            hidden_mem = rnn.memory(init=decoder_boot, need_reorder=True)
            context = self.simple_attention(encoder_vec, encoder_proj,
                                            hidden_mem, decoder_size)
            context_fc = layers.fc(input=context,
                                   size=gate_size,
                                   bias_attr=False,
                                   name="rnn_fc1")
            word_fc = layers.fc(input=current_word,
                                size=gate_size,
                                bias_attr=False,
                                name="rnn_fc2")
            decoder_inputs = context_fc + word_fc
            h, _, _ = layers.gru_unit(
                input=decoder_inputs, hidden=hidden_mem, size=gate_size)
            rnn.update_memory(hidden_mem, h)
            out = layers.fc(input=h,
                            size=char_num,
                            bias_attr=True,
                            act='softmax',
                            name="rnn_out_fc")
            rnn.output(out)
        return rnn()

    def gru_attention_infer(self, decoder_boot, max_length, char_num,
                            word_vector_dim, encoded_vector, encoded_proj,
                            decoder_size):
        """
        Inference decoder: a While loop taking the top-1 character at every
        step (beam_size is fixed to 1).
        args:
            decoder_boot (variable): initial decoder hidden state.
            max_length (int): upper bound of the loop counter.
            char_num (int): size of the softmax output.
            word_vector_dim (int): embedding dimension.
            encoded_vector (variable): encoded sequence.
            encoded_proj (variable): projection of the encoded sequence.
            decoder_size (int): hidden size of the GRU.
        return: (variable) full_ids, to which the selected indices are
            concatenated along axis 1 at every step.
        """
        init_state = decoder_boot
        beam_size = 1
        gate_size = decoder_size * 3
        array_len = layers.fill_constant(
            shape=[1], dtype='int64', value=max_length)
        counter = layers.zeros(shape=[1], dtype='int64', force_cpu=True)

        # fill the first element with init_state
        state_array = layers.create_array('float32')
        layers.array_write(init_state, array=state_array, i=counter)

        # ids, scores as memory
        ids_array = layers.create_array('int64')
        scores_array = layers.create_array('float32')

        # build a 0..batch_size range used as lod for the initial ids
        # NOTE(review): presumably one sequence of length 1 per sample --
        # confirm against the lod_reset / lod_append semantics.
        rois_shape = layers.shape(init_state)
        batch_size = layers.slice(
            rois_shape, axes=[0], starts=[0], ends=[1]) + 1
        lod_level = layers.range(
            start=0, end=batch_size, step=1, dtype=batch_size.dtype)

        init_ids = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=0, dtype='int64')
        init_ids = layers.lod_reset(init_ids, lod_level)
        init_ids = layers.lod_append(init_ids, lod_level)

        init_scores = layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], value=1, dtype='float32')
        init_scores = layers.lod_reset(init_scores, init_ids)

        layers.array_write(init_ids, array=ids_array, i=counter)
        layers.array_write(init_scores, array=scores_array, i=counter)

        full_ids = fluid.layers.fill_constant_batch_size_like(
            input=init_state, shape=[-1, 1], dtype='int64', value=1)

        cond = layers.less_than(x=counter, y=array_len)
        while_op = layers.While(cond=cond)
        with while_op.block():
            pre_ids = layers.array_read(array=ids_array, i=counter)
            pre_state = layers.array_read(array=state_array, i=counter)
            pre_score = layers.array_read(array=scores_array, i=counter)

            pre_ids_emb = layers.embedding(
                input=pre_ids,
                size=[char_num, word_vector_dim],
                dtype='float32')

            context = self.simple_attention(encoded_vector, encoded_proj,
                                            pre_state, decoder_size)

            # expand the recursive_sequence_lengths of pre_state
            # to be the same with pre_score
            pre_state_expanded = layers.sequence_expand(pre_state, pre_score)
            context_expanded = layers.sequence_expand(context, pre_score)

            context_fc = layers.fc(input=context_expanded,
                                   size=gate_size,
                                   bias_attr=False,
                                   name="rnn_fc1")
            ids_fc = layers.fc(input=pre_ids_emb,
                               size=gate_size,
                               bias_attr=False,
                               name="rnn_fc2")
            decoder_inputs = context_fc + ids_fc
            current_state, _, _ = layers.gru_unit(
                input=decoder_inputs,
                hidden=pre_state_expanded,
                size=gate_size)
            current_state_with_lod = layers.lod_reset(
                x=current_state, y=pre_score)

            # use score to do beam search
            current_score = layers.fc(input=current_state_with_lod,
                                      size=char_num,
                                      bias_attr=True,
                                      act='softmax',
                                      name="rnn_out_fc")
            topk_scores, topk_indices = layers.topk(current_score, k=beam_size)

            # append the selected indices to the running result
            new_ids = fluid.layers.concat([full_ids, topk_indices], axis=1)
            fluid.layers.assign(new_ids, full_ids)

            layers.increment(x=counter, value=1, in_place=True)

            # update the memories
            layers.array_write(current_state, array=state_array, i=counter)
            layers.array_write(topk_indices, array=ids_array, i=counter)
            layers.array_write(topk_scores, array=scores_array, i=counter)

            # update the break condition:
            # up to the max length or all candidates of
            # source sentences have ended.
            length_cond = layers.less_than(x=counter, y=array_len)
            finish_cond = layers.logical_not(layers.is_empty(x=topk_indices))
            layers.logical_and(x=length_cond, y=finish_cond, out=cond)
        return full_ids

    def __call__(self, inputs, labels=None, mode=None):
        """
        Build the attention head.
        args:
            inputs (variable): features handed to the sequence encoder.
            labels (dict): must provide 'label_in' and 'label_out' when
                mode is "train", default is None.
            mode (str): "train" builds the teacher-forced decoder, any
                other value builds the inference decoder.
        return: (dict) {'predict', 'decoded_out'} in train mode, otherwise
            {'decoded_out'}.
        """
        encoder_features = self.encoder(inputs)
        char_num = self.char_num
        word_vector_dim = self.word_vector_dim
        decoder_size = self.decoder_size

        if self.encoder_type == "reshape":
            encoder_input = encoder_features
            encoded_vector = encoder_features
        else:
            # the encoder returned a list: its second element seeds the
            # decoder, the concatenation is attended over
            encoder_input = encoder_features[1]
            encoded_vector = layers.concat(encoder_features, axis=1)

        encoded_proj = layers.fc(input=encoded_vector,
                                 size=decoder_size,
                                 bias_attr=False,
                                 name="encoded_proj_fc")
        backward_first = layers.sequence_pool(
            input=encoder_input, pool_type='first')
        decoder_boot = layers.fc(input=backward_first,
                                 size=decoder_size,
                                 bias_attr=False,
                                 act="relu",
                                 name='decoder_boot')

        if mode != "train":
            ids = self.gru_attention_infer(
                decoder_boot, self.max_length, char_num, word_vector_dim,
                encoded_vector, encoded_proj, decoder_size)
            return {'decoded_out': ids}

        label_in = labels['label_in']
        label_out = labels['label_out']
        label_in = layers.cast(x=label_in, dtype='int64')
        trg_embedding = layers.embedding(
            input=label_in,
            size=[char_num, word_vector_dim],
            dtype='float32')
        predict = self.gru_decoder_with_attention(
            trg_embedding, encoded_vector, encoded_proj, decoder_boot,
            decoder_size, char_num)
        _, decoded_out = layers.topk(input=predict, k=1)
        decoded_out = layers.lod_reset(decoded_out, y=label_out)
        return {'predict': predict, 'decoded_out': decoded_out}

View File

@ -0,0 +1,51 @@
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
from .rec_seq_encoder import SequenceEncoder
from ..common_functions import get_para_bias_attr
import numpy as np
class CTCPredict(object):
    """
    CTC recognition head: sequence encoder followed by a fully connected
    layer with char_num + 1 outputs.
    args:
        params(dict): super parameters; the keys 'char_num' and
            'encoder_type' are read, and the dict is also handed to
            SequenceEncoder.
    """

    def __init__(self, params):
        super(CTCPredict, self).__init__()
        self.char_num = params['char_num']
        self.encoder = SequenceEncoder(params)
        self.encoder_type = params['encoder_type']

    def __call__(self, inputs, labels=None, mode=None):
        """
        Build the CTC head.
        args:
            inputs (variable): features handed to the sequence encoder.
            labels: not used, default is None.
            mode: not used, default is None.
        return: (dict) 'predict' is the fc output and 'decoded_out' the
            result of ctc_greedy_decoder (blank index is char_num).
        """
        features = self.encoder(inputs)
        if self.encoder_type != "reshape":
            # the encoder returned a list of features: merge them
            features = fluid.layers.concat(features, axis=1)
        fc_name = "ctc_fc"
        weight_attr, offset_attr = get_para_bias_attr(
            l2_decay=0.0004, k=features.shape[1], name=fc_name)
        predict = fluid.layers.fc(input=features,
                                  size=self.char_num + 1,
                                  param_attr=weight_attr,
                                  bias_attr=offset_attr,
                                  name=fc_name)
        decoded_out = fluid.layers.ctc_greedy_decoder(
            input=predict, blank=self.char_num)
        return {'predict': predict, 'decoded_out': decoded_out}

View File

@ -0,0 +1,100 @@
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
import paddle.fluid.layers as layers
class EncoderWithReshape(object):
    """
    Turn a feature map into a sequence with im2sequence.
    args:
        params(dict): not used.
    """

    def __init__(self, params):
        super(EncoderWithReshape, self).__init__()

    def __call__(self, inputs):
        """
        args:
            inputs (variable): feature map; inputs.shape[2] is used as the
                first filter dimension, so each window spans that whole axis.
        return: (variable) output of im2sequence.
        """
        window = [inputs.shape[2], 1]
        return layers.im2sequence(
            input=inputs,
            stride=[1, 1],
            filter_size=window,
            name="sliced_feature")
class EncoderWithRNN(object):
    """
    Two stacked fc + dynamic_lstm stages, built once in forward and once in
    reverse direction.
    args:
        params(dict): super parameters; params['SeqRNN']['hidden_size'] is
            read.
    """

    def __init__(self, params):
        super(EncoderWithRNN, self).__init__()
        self.rnn_hidden_size = params['SeqRNN']['hidden_size']

    def __call__(self, inputs):
        """
        args:
            inputs (variable): input sequence.
        return: (list) [forward lstm output, reverse lstm output].
        """
        name_prefix = "lstm"
        gate_size = self.rnn_hidden_size * 4
        lstm_list = []
        # direction 1 runs forward, direction 2 runs in reverse
        for no, is_reverse in ((1, False), (2, True)):
            feature = inputs
            for stage in ("st1", "st2"):
                fc_name = "%s_%s_fc%d" % (name_prefix, stage, no)
                projected = layers.fc(
                    input=feature,
                    size=gate_size,
                    param_attr=fluid.ParamAttr(name=fc_name + "_w"),
                    bias_attr=fluid.ParamAttr(name=fc_name + "_b"),
                    name=fc_name)
                lstm_name = "%s_%s_out%d" % (name_prefix, stage, no)
                feature, _ = layers.dynamic_lstm(
                    input=projected,
                    size=gate_size,
                    is_reverse=is_reverse,
                    param_attr=fluid.ParamAttr(name=lstm_name + "_w"),
                    bias_attr=fluid.ParamAttr(name=lstm_name + "_b"),
                    use_peepholes=False)
            lstm_list.append(feature)
        return lstm_list
class SequenceEncoder(object):
    """
    Dispatch between the reshape-only encoder and the reshape + RNN encoder.
    args:
        params(dict): super parameters; the key 'encoder_type' is read
            ("reshape" or "rnn") and the dict is handed to the encoders.
    """

    def __init__(self, params):
        super(SequenceEncoder, self).__init__()
        self.encoder_type = params['encoder_type']
        self.encoder_reshape = EncoderWithReshape(params)
        if self.encoder_type == "rnn":
            self.encoder_rnn = EncoderWithRNN(params)

    def __call__(self, inputs):
        """
        args:
            inputs (variable): feature map to encode.
        return: the reshape encoder output for "reshape", or the RNN
            encoder output for "rnn".
        raises:
            AssertionError: when encoder_type is neither "reshape" nor "rnn".
        """
        if self.encoder_type == "reshape":
            return self.encoder_reshape(inputs)
        if self.encoder_type == "rnn":
            return self.encoder_rnn(self.encoder_reshape(inputs))
        # Raised explicitly instead of `assert False`: assert statements are
        # stripped under `python -O`, which would let the method run on to
        # an unbound local variable.
        raise AssertionError("Unsupport encoder_type:%s" % self.encoder_type)

View File

@ -0,0 +1,116 @@
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
import paddle.fluid as fluid
def BalanceLoss(pred,
                gt,
                mask,
                balance_loss=True,
                main_loss_type="DiceLoss",
                negative_ratio=3,
                return_origin=False,
                eps=1e-6):
    """
    The BalanceLoss for Differentiable Binarization text detection
    args:
        pred (variable): predicted feature maps.
        gt (variable): ground truth feature maps.
        mask (variable): masked maps.
        balance_loss (bool): whether balance loss or not, default is True
        main_loss_type (str): can only be one of ['CrossEntropy','DiceLoss',
            'Euclidean','BCELoss', 'MaskL1Loss'], default is 'DiceLoss'.
        negative_ratio (int|float): upper bound of the negative / positive
            count ratio kept by the balancing, default is 3.
        return_origin (bool): whether return unbalanced loss or not, default is False.
        eps (float): default is 1e-6.
    return: (variable) balanced loss; the unbalanced loss when balance_loss
        is False; the tuple (balanced loss, unbalanced loss) when
        return_origin is True.
    raises:
        Exception: when main_loss_type is not one of the supported names.
    """
    positive = gt * mask
    negative = (1 - gt) * mask

    positive_count = fluid.layers.reduce_sum(positive)
    # Python's builtin min() cannot compare graph variables: the comparison
    # yields another variable, which is always truthy, so min() never clamps.
    # elementwise_min performs the comparison inside the graph.
    negative_count = fluid.layers.elementwise_min(
        fluid.layers.reduce_sum(negative), positive_count * negative_ratio)
    negative_count_int = fluid.layers.cast(negative_count, dtype=np.int32)

    if main_loss_type == "CrossEntropy":
        loss = fluid.layers.cross_entropy(input=pred, label=gt, soft_label=True)
        loss = fluid.layers.reduce_mean(loss)
    elif main_loss_type == "Euclidean":
        loss = fluid.layers.square(pred - gt)
        loss = fluid.layers.reduce_mean(loss)
    elif main_loss_type == "DiceLoss":
        loss = DiceLoss(pred, gt, mask)
    elif main_loss_type == "BCELoss":
        # NOTE(review): unlike the other branches this loss is not reduced
        # here -- confirm that this is intended.
        loss = fluid.layers.sigmoid_cross_entropy_with_logits(pred, label=gt)
    elif main_loss_type == "MaskL1Loss":
        loss = MaskL1Loss(pred, gt, mask)
    else:
        loss_type = [
            'CrossEntropy', 'DiceLoss', 'Euclidean', 'BCELoss', 'MaskL1Loss'
        ]
        raise Exception("main_loss_type in BalanceLoss() can only be one of {}".
                        format(loss_type))

    if not balance_loss:
        return loss

    positive_loss = positive * loss
    negative_loss = negative * loss
    # keep only the negative_count largest negative losses
    negative_loss = fluid.layers.reshape(negative_loss, shape=[-1])
    negative_loss, _ = fluid.layers.topk(negative_loss, k=negative_count_int)
    balanced = (fluid.layers.reduce_sum(positive_loss) +
                fluid.layers.reduce_sum(negative_loss)) / (
                    positive_count + negative_count + eps)

    if return_origin:
        return balanced, loss
    return balanced
def DiceLoss(pred, gt, mask, weights=None, eps=1e-6):
    """
    Dice loss: 1 - 2 * sum(pred * gt * mask) / (sum(pred * mask) + sum(gt * mask) + eps)
    args:
        pred (variable): predicted map.
        gt (variable): ground truth map, same shape as pred.
        mask (variable): mask map, same shape as pred.
        weights (variable): optional map multiplied into mask, same shape
            as mask, default is None.
        eps (float): added to the denominator, default is 1e-6.
    return: (variable) the dice loss.
    """
    assert pred.shape == gt.shape
    assert pred.shape == mask.shape
    if weights is not None:
        assert weights.shape == mask.shape
        mask = weights * mask
    overlap = fluid.layers.reduce_sum(pred * gt * mask)
    pred_area = fluid.layers.reduce_sum(pred * mask)
    gt_area = fluid.layers.reduce_sum(gt * mask)
    denominator = pred_area + gt_area + eps
    loss = 1 - 2.0 * overlap / denominator
    # NOTE(review): if loss is a static graph variable this compares
    # symbolically and may never fail -- confirm the intent.
    assert loss <= 1
    return loss
def MaskL1Loss(pred, gt, mask, eps=1e-6):
    """
    Mask L1 Loss: sum(|pred - gt| * mask) / (sum(mask) + eps)
    args:
        pred (variable): predicted map.
        gt (variable): ground truth map.
        mask (variable): mask map applied to the absolute error.
        eps (float): added to the denominator, default is 1e-6.
    return: (variable) the masked L1 loss.
    """
    masked_error = fluid.layers.reduce_sum(fluid.layers.abs(pred - gt) * mask)
    normalizer = fluid.layers.reduce_sum(mask) + eps
    loss = masked_error / normalizer
    return fluid.layers.reduce_mean(loss)

View File

@ -0,0 +1,68 @@
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from .det_basic_loss import BalanceLoss, MaskL1Loss, DiceLoss
class DBLoss(object):
    """
    Differentiable Binarization (DB) Loss Function
    args:
        params (dict): the super parameters for DB Loss; the keys
            'balance_loss', 'main_loss_type', 'alpha', 'beta' and
            'ohem_ratio' are read.
    """

    def __init__(self, params):
        super(DBLoss, self).__init__()
        self.balance_loss = params['balance_loss']
        self.main_loss_type = params['main_loss_type']
        self.alpha = params['alpha']
        self.beta = params['beta']
        self.ohem_ratio = params['ohem_ratio']

    def __call__(self, predicts, labels):
        """
        args:
            predicts (dict): 'maps' holds the prediction; channels 0, 1 and
                2 along axis 1 are read as shrink, threshold and binary map.
            labels (dict): must provide 'shrink_map', 'shrink_mask',
                'threshold_map' and 'threshold_mask'.
        return: (dict) 'total_loss' plus the three weighted terms.
        """
        gt_shrink = labels['shrink_map']
        gt_shrink_mask = labels['shrink_mask']
        gt_threshold = labels['threshold_map']
        gt_threshold_mask = labels['threshold_mask']

        maps = predicts['maps']
        shrink_maps = maps[:, 0, :, :]
        threshold_maps = maps[:, 1, :, :]
        binary_maps = maps[:, 2, :, :]

        shrink_term = BalanceLoss(
            shrink_maps,
            gt_shrink,
            gt_shrink_mask,
            balance_loss=self.balance_loss,
            main_loss_type=self.main_loss_type,
            negative_ratio=self.ohem_ratio)
        threshold_term = MaskL1Loss(threshold_maps, gt_threshold,
                                    gt_threshold_mask)
        binary_term = DiceLoss(binary_maps, gt_shrink, gt_shrink_mask)

        # alpha weights the shrink term, beta the threshold term
        shrink_term = self.alpha * shrink_term
        threshold_term = self.beta * threshold_term
        total = shrink_term + threshold_term + binary_term

        return {
            'total_loss': total,
            "loss_shrink_maps": shrink_term,
            "loss_threshold_maps": threshold_term,
            "loss_binary_maps": binary_term,
        }

View File

@ -0,0 +1,61 @@
#copyright (c) 2020 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
class EASTLoss(object):
    """
    EAST Loss function: dice loss on the score map plus a smooth L1 style
    loss on the geometry map.
    args:
        params: not used, default is None.
    """

    def __init__(self, params=None):
        super(EASTLoss, self).__init__()

    def __call__(self, predicts, labels):
        """
        args:
            predicts (dict): must provide 'f_score' and 'f_geo'.
            labels (dict): must provide 'score', 'geo' and 'mask'; 'geo' is
                split into 9 parts along axis 1, the last one weighting
                the geometry loss.
        return: (dict) 'total_loss', 'dice_loss' (already scaled by 0.01)
            and 'smooth_l1_loss'.
        """
        pred_score = predicts['f_score']
        pred_geo = predicts['f_geo']
        gt_score = labels['score']
        gt_geo = labels['geo']
        gt_mask = labels['mask']

        # dice loss on the score map
        intersection = fluid.layers.reduce_sum(pred_score * gt_score * gt_mask)
        union = fluid.layers.reduce_sum(pred_score * gt_mask)\
            + fluid.layers.reduce_sum(gt_score * gt_mask)
        dice_loss = 1 - 2 * intersection / (union + 1e-5)

        # smooth l1 loss on the geometry map
        channels = 8
        gt_geo_parts = fluid.layers.split(
            gt_geo, num_or_sections=channels + 1, dim=1)
        pred_geo_parts = fluid.layers.split(
            pred_geo, num_or_sections=channels, dim=1)
        geo_weight = gt_geo_parts[-1]
        smooth_l1 = 0
        for gt_part, pred_part in zip(gt_geo_parts[:channels], pred_geo_parts):
            abs_diff = fluid.layers.abs(gt_part - pred_part)
            # 1.0 where the error is below the score label, 0.0 elsewhere
            is_small = fluid.layers.less_than(abs_diff, gt_score)
            is_small = fluid.layers.cast(is_small, dtype='float32')
            in_loss = abs_diff * abs_diff * is_small + \
                (abs_diff - 0.5) * (1.0 - is_small)
            out_loss = geo_weight / channels * in_loss * gt_score
            smooth_l1 += out_loss
        smooth_l1_loss = fluid.layers.reduce_mean(smooth_l1 * gt_score)

        dice_loss = dice_loss * 0.01
        total_loss = dice_loss + smooth_l1_loss
        return {
            'total_loss': total_loss,
            "dice_loss": dice_loss,
            "smooth_l1_loss": smooth_l1_loss,
        }

View File

@ -0,0 +1,38 @@
#copyright (c) 2019 PaddlePaddle Authors. All Rights Reserve.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
from paddle.fluid.param_attr import ParamAttr
import numpy as np
class AttentionLoss(object):
    """Summed cross-entropy loss used by the attention recognition head."""

    def __init__(self, params):
        """Keep ``params['char_num']`` on the instance."""
        super(AttentionLoss, self).__init__()
        self.char_num = params['char_num']

    def __call__(self, predicts, labels):
        """Build the loss op.

        Args:
            predicts (dict): must contain ``'predict'``, the input handed to
                ``fluid.layers.cross_entropy``.
            labels (dict): must contain ``'label_out'``; it is cast to int64
                before being used as the cross-entropy label.

        Returns:
            The ``reduce_sum`` of the cross-entropy cost.
        """
        logits, target = predicts['predict'], labels['label_out']
        target = fluid.layers.cast(x=target, dtype='int64')
        per_item_cost = fluid.layers.cross_entropy(input=logits, label=target)
        return fluid.layers.reduce_sum(per_item_cost)

View File

@ -0,0 +1,36 @@
#Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle
import paddle.fluid as fluid
class CTCLoss(object):
    """Summed warp-CTC loss used by the CTC recognition head."""

    def __init__(self, params):
        """Keep ``params['char_num']``; it is passed to warpctc as the blank index."""
        super(CTCLoss, self).__init__()
        self.char_num = params['char_num']

    def __call__(self, predicts, labels):
        """Build the loss op.

        Args:
            predicts (dict): must contain ``'predict'`` (warpctc input).
            labels (dict): must contain ``'label'`` (warpctc label).

        Returns:
            The ``reduce_sum`` of the ``fluid.layers.warpctc`` cost, computed
            with ``blank=self.char_num`` and ``norm_by_times=True``.
        """
        logits, target = predicts['predict'], labels['label']
        ctc_cost = fluid.layers.warpctc(
            input=logits,
            label=target,
            blank=self.char_num,
            norm_by_times=True)
        return fluid.layers.reduce_sum(ctc_cost)

261
ppocr/modeling/stns/tps.py Executable file
View File

@ -0,0 +1,261 @@
#Copyright (c) 2019 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import math
import paddle.fluid as fluid
import paddle.fluid.layers as layers
from paddle.fluid.param_attr import ParamAttr
import numpy as np
class LocalizationNetwork(object):
    """Convolutional network that predicts the TPS fiducial points of an image."""

    def __init__(self, params):
        """Read ``num_fiducial``, ``loc_lr`` and ``model_name`` from ``params``."""
        super(LocalizationNetwork, self).__init__()
        self.F = params['num_fiducial']
        self.loc_lr = params['loc_lr']
        self.model_name = params['model_name']

    def conv_bn_layer(self,
                      input,
                      num_filters,
                      filter_size,
                      stride=1,
                      groups=1,
                      act=None,
                      name=None):
        """Bias-free conv2d followed by batch norm.

        The convolution itself has no activation; ``act`` is applied by the
        batch-norm layer. Parameter names are derived from ``name``
        (``<name>_weights`` and ``bn_<name>_{scale,offset,mean,variance}``),
        so ``name`` must be a string.
        """
        same_padding = (filter_size - 1) // 2
        conv_out = layers.conv2d(
            input=input,
            num_filters=num_filters,
            filter_size=filter_size,
            stride=stride,
            padding=same_padding,
            groups=groups,
            act=None,
            param_attr=ParamAttr(name=name + "_weights"),
            bias_attr=False)
        bn_prefix = "bn_" + name
        return layers.batch_norm(
            input=conv_out,
            act=act,
            param_attr=ParamAttr(name=bn_prefix + '_scale'),
            bias_attr=ParamAttr(bn_prefix + '_offset'),
            moving_mean_name=bn_prefix + '_mean',
            moving_variance_name=bn_prefix + '_variance')

    def get_initial_fiducials(self):
        """ see RARE paper Fig. 6 (a)

        Returns a numpy array with 2 * int(F / 2) rows of (x, y): the top half
        uses y going linearly from 0 to -1, the bottom half y from 1 to 0, and
        both halves share x evenly spaced in [-1, 1].
        """
        half = int(self.F / 2)
        xs = np.linspace(-1.0, 1.0, half)
        top = np.stack([xs, np.linspace(0.0, -1.0, num=half)], axis=1)
        bottom = np.stack([xs, np.linspace(1.0, 0.0, num=half)], axis=1)
        return np.concatenate([top, bottom], axis=0)

    def __call__(self, image):
        """Build the localization graph and return fiducials reshaped to [-1, F, 2]."""
        F = self.F
        loc_lr = self.loc_lr
        if self.model_name == "large":
            num_filters_list, fc_dim = [64, 128, 256, 512], 256
        else:
            num_filters_list, fc_dim = [16, 32, 64, 128], 64

        # Conv stages: every stage but the last is followed by a 2x2 max pool;
        # the last one is reduced to 1x1 by adaptive average pooling.
        last_stage = len(num_filters_list) - 1
        feat = image
        for stage, num_filters in enumerate(num_filters_list):
            conv = self.conv_bn_layer(
                feat, num_filters, 3, act='relu', name="loc_conv%d" % stage)
            if stage == last_stage:
                feat = layers.adaptive_pool2d(
                    input=conv, pool_size=[1, 1], pool_type='avg')
            else:
                feat = layers.pool2d(
                    input=conv,
                    pool_size=2,
                    pool_stride=2,
                    pool_padding=0,
                    pool_type='max')

        fc1_name = "loc_fc1"
        stdv = 1.0 / math.sqrt(feat.shape[1] * 1.0)
        fc1 = layers.fc(input=feat,
                        size=fc_dim,
                        param_attr=fluid.param_attr.ParamAttr(
                            learning_rate=loc_lr,
                            initializer=fluid.initializer.Uniform(-stdv, stdv),
                            name=fc1_name + "_w"),
                        act='relu',
                        name=fc1_name)

        # The last FC starts with zero weights and the initial fiducials as
        # bias, so its initial output is exactly the initial fiducial layout.
        initial_bias = self.get_initial_fiducials().reshape(-1)
        fc2_name = "loc_fc2"
        param_attr = fluid.param_attr.ParamAttr(
            learning_rate=loc_lr,
            initializer=fluid.initializer.NumpyArrayInitializer(
                np.zeros([fc_dim, F * 2])),
            name=fc2_name + "_w")
        bias_attr = fluid.param_attr.ParamAttr(
            learning_rate=loc_lr,
            initializer=fluid.initializer.NumpyArrayInitializer(initial_bias),
            name=fc2_name + "_b")
        fc2 = layers.fc(input=fc1,
                        size=F * 2,
                        param_attr=param_attr,
                        bias_attr=bias_attr,
                        name=fc2_name)
        return layers.reshape(x=fc2, shape=[-1, F, 2], inplace=False)
class GridGenerator(object):
    """Turns predicted fiducial points into a thin-plate-spline sampling grid."""

    def __init__(self, params):
        """Read ``num_fiducial`` from ``params``; ``eps`` guards log(0) in the RBF."""
        super(GridGenerator, self).__init__()
        self.eps = 1e-6
        self.F = params['num_fiducial']

    def build_C(self):
        """ Return coordinates of fiducial points in I_r; C """
        half = int(self.F / 2)
        xs = np.linspace(-1.0, 1.0, half)
        top = np.stack([xs, -1 * np.ones(half)], axis=1)
        bottom = np.stack([xs, np.ones(half)], axis=1)
        return np.concatenate([top, bottom], axis=0)  # F x 2

    def build_P(self, I_r_size):
        """Return the pixel-centre coordinates of the output image.

        ``I_r_size`` is (width, height). The result has height * width rows of
        (x, y), with x varying fastest.
        """
        I_r_width, I_r_height = I_r_size
        grid_x = (np.arange(-I_r_width, I_r_width, 2) + 1.0) / I_r_width
        grid_y = (np.arange(-I_r_height, I_r_height, 2) + 1.0) / I_r_height
        # meshgrid yields (height, width) planes -> mesh is height x width x 2
        mesh = np.stack(np.meshgrid(grid_x, grid_y), axis=2)
        return mesh.reshape([-1, 2])

    def build_inv_delta_C(self, C):
        """ Return inv_delta_C which is needed to calculate T """
        F = self.F
        hat_C = np.zeros((F, F), dtype=float)  # F x F pairwise distances
        for row in range(F):
            for col in range(row, F):
                dist = np.linalg.norm(C[row] - C[col])
                hat_C[row, col] = dist
                hat_C[col, row] = dist
        # A diagonal of 1 makes r^2 * log(r) evaluate to 0 there instead of nan.
        np.fill_diagonal(hat_C, 1)
        hat_C = (hat_C**2) * np.log(hat_C)
        top_rows = np.concatenate(
            [np.ones((F, 1)), C, hat_C], axis=1)  # F x F+3
        mid_rows = np.concatenate(
            [np.zeros((2, 3)), np.transpose(C)], axis=1)  # 2 x F+3
        bottom_row = np.concatenate(
            [np.zeros((1, 3)), np.ones((1, F))], axis=1)  # 1 x F+3
        delta_C = np.concatenate(
            [top_rows, mid_rows, bottom_row], axis=0)  # F+3 x F+3
        return np.linalg.inv(delta_C)

    def build_P_hat(self, C, P):
        """Return the n x (F + 3) matrix [1, P, rbf(P, C)] for the n rows of P."""
        n = P.shape[0]
        # n x 2 -> n x 1 x 2 -> n x F x 2
        P_tile = np.tile(np.expand_dims(P, axis=1), (1, self.F, 1))
        P_diff = P_tile - np.expand_dims(C, axis=0)  # n x F x 2
        rbf_norm = np.linalg.norm(P_diff, ord=2, axis=2, keepdims=False)
        rbf = np.multiply(np.square(rbf_norm), np.log(rbf_norm + self.eps))
        return np.concatenate([np.ones((n, 1)), P, rbf], axis=1)

    def get_expand_tensor(self, batch_C_prime):
        """Return a [-1, 3, 2] tensor produced by a frozen, zero-initialised FC.

        Weight and bias of the ``ex_fc`` layer are initialised to 0.0 and use
        learning rate 0.0.
        """
        layer_name = "ex_fc"
        zero_init = fluid.initializer.ConstantInitializer(value=0.0)
        param_attr = fluid.param_attr.ParamAttr(
            learning_rate=0.0, initializer=zero_init, name=layer_name + "_w")
        bias_attr = fluid.param_attr.ParamAttr(
            learning_rate=0.0, initializer=zero_init, name=layer_name + "_b")
        expanded = fluid.layers.fc(input=batch_C_prime,
                                   size=6,
                                   param_attr=param_attr,
                                   bias_attr=bias_attr,
                                   name=layer_name)
        return fluid.layers.reshape(x=expanded, shape=[-1, 3, 2])

    def __call__(self, batch_C_prime, I_r_size):
        """Build the graph computing P' = P_hat . (inv_delta_C . [C'; ex_fc rows])."""
        C = self.build_C()
        P = self.build_P(I_r_size)
        inv_delta_C = self.build_inv_delta_C(C).astype('float32')
        P_hat = self.build_P_hat(C, P).astype('float32')

        # Both matrices are constants of the graph: no gradient flows into them.
        inv_delta_C_tensor = layers.create_tensor(dtype='float32')
        layers.assign(inv_delta_C, inv_delta_C_tensor)
        inv_delta_C_tensor.stop_gradient = True
        P_hat_tensor = layers.create_tensor(dtype='float32')
        layers.assign(P_hat, P_hat_tensor)
        P_hat_tensor.stop_gradient = True

        batch_C_ex_part_tensor = self.get_expand_tensor(batch_C_prime)
        batch_C_ex_part_tensor.stop_gradient = True

        batch_C_prime_with_zeros = layers.concat(
            [batch_C_prime, batch_C_ex_part_tensor], axis=1)
        batch_T = layers.matmul(inv_delta_C_tensor, batch_C_prime_with_zeros)
        return layers.matmul(P_hat_tensor, batch_T)
class TPS(object):
    """Thin-plate-spline rectification: localization network + grid sampling."""

    def __init__(self, params):
        """Create the localization network and grid generator from ``params``."""
        super(TPS, self).__init__()
        self.loc_net = LocalizationNetwork(params)
        self.grid_generator = GridGenerator(params)

    def __call__(self, image):
        """Return ``image`` resampled on the predicted TPS grid.

        ``image.shape[2]`` and ``image.shape[3]`` are used as the output grid
        height and width. As a side effect ``image.stop_gradient`` is set to
        False.
        """
        fiducials = self.loc_net(image)
        height, width = image.shape[2], image.shape[3]
        # The grid generator expects the size as [width, height].
        flat_grid = self.grid_generator(fiducials, [width, height])
        grid = layers.reshape(x=flat_grid, shape=[-1, height, width, 2])
        rectified = layers.grid_sampler(x=image, grid=grid)
        image.stop_gradient = False
        return rectified

36
ppocr/optimizer.py Executable file
View File

@ -0,0 +1,36 @@
#Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle.fluid as fluid
def AdamDecay(params, parameter_list=None):
    """
    Create the Adam optimizer described by ``params``.
    args:
        params(dict): hyper-parameters; ``base_lr``, ``beta1`` and ``beta2``
            are read from it
        parameter_list (list): list of Variable names to update to minimize loss
    return:
        the ``fluid.optimizer.Adam`` instance
    """
    return fluid.optimizer.Adam(
        learning_rate=params['base_lr'],
        beta1=params['beta1'],
        beta2=params['beta2'],
        parameter_list=parameter_list)

View File

@ -0,0 +1,152 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import paddle
import paddle.fluid as fluid
import numpy as np
import string
import cv2
from shapely.geometry import Polygon
import pyclipper
class DBPostProcess(object):
    """
    The post process for Differentiable Binarization (DB).
    """

    def __init__(self, params):
        """Read ``thresh``, ``box_thresh`` and ``max_candidates`` from ``params``."""
        self.thresh = params['thresh']
        self.box_thresh = params['box_thresh']
        self.max_candidates = params['max_candidates']
        # Candidates whose min-area rectangle has a shorter side are dropped.
        self.min_size = 3

    def boxes_from_bitmap(self, pred, _bitmap, dest_width, dest_height):
        '''
        Extract scored quadrilaterals from one binarized map.

        pred: 2-D probability map used to score each candidate.
        _bitmap: 2-D map with shape (H, W), whose values are binarized
            as {0, 1}
        dest_width, dest_height: size the box coordinates are rescaled to.

        Returns (boxes, scores): an int16 array (num_contours, 4, 2) and a
        float32 array (num_contours,). Rows of rejected candidates stay
        all-zero with score 0.
        '''
        bitmap = _bitmap
        height, width = bitmap.shape

        # OpenCV 3.x returns (image, contours, hierarchy) while 2.x/4.x return
        # (contours, hierarchy); the contours are always second from the end.
        found = cv2.findContours((bitmap * 255).astype(np.uint8),
                                 cv2.RETR_LIST, cv2.CHAIN_APPROX_SIMPLE)
        contours = found[-2]

        num_contours = min(len(contours), self.max_candidates)
        boxes = np.zeros((num_contours, 4, 2), dtype=np.int16)
        scores = np.zeros((num_contours, ), dtype=np.float32)

        for index in range(num_contours):
            contour = contours[index]
            points, sside = self.get_mini_boxes(contour)
            if sside < self.min_size:
                continue
            points = np.array(points)
            score = self.box_score_fast(pred, points.reshape(-1, 2))
            if self.box_thresh > score:
                continue

            # Expand the box, then re-fit a rectangle to the expanded polygon.
            box = self.unclip(points).reshape(-1, 1, 2)
            box, sside = self.get_mini_boxes(box)
            if sside < self.min_size + 2:
                continue
            box = np.array(box)
            if not isinstance(dest_width, int):
                dest_width = dest_width.item()
                dest_height = dest_height.item()

            box[:, 0] = np.clip(
                np.round(box[:, 0] / width * dest_width), 0, dest_width)
            box[:, 1] = np.clip(
                np.round(box[:, 1] / height * dest_height), 0, dest_height)
            boxes[index, :, :] = box.astype(np.int16)
            scores[index] = score
        return boxes, scores

    def unclip(self, box, unclip_ratio=1.5):
        """Offset ``box`` outwards by area * unclip_ratio / perimeter (pyclipper)."""
        poly = Polygon(box)
        distance = poly.area * unclip_ratio / poly.length
        offset = pyclipper.PyclipperOffset()
        offset.AddPath(box, pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        expanded = np.array(offset.Execute(distance))
        return expanded

    def get_mini_boxes(self, contour):
        """Return (4 ordered corners, shorter side) of the min-area rectangle.

        Corners are sorted by x; within the two left-most and the two
        right-most points the y order decides the final position, giving the
        order [left/smaller-y, right/smaller-y, right/larger-y, left/larger-y].
        """
        bounding_box = cv2.minAreaRect(contour)
        points = sorted(list(cv2.boxPoints(bounding_box)), key=lambda x: x[0])

        if points[1][1] > points[0][1]:
            index_1, index_4 = 0, 1
        else:
            index_1, index_4 = 1, 0
        if points[3][1] > points[2][1]:
            index_2, index_3 = 2, 3
        else:
            index_2, index_3 = 3, 2

        box = [
            points[index_1], points[index_2], points[index_3], points[index_4]
        ]
        return box, min(bounding_box[1])

    def box_score_fast(self, bitmap, _box):
        """Mean of ``bitmap`` inside the polygon ``_box`` (N x 2 points).

        Only the bounding rectangle of the polygon, clipped to the map, is
        rasterized. ``_box`` itself is not modified.
        """
        h, w = bitmap.shape[:2]
        box = _box.copy()
        # The removed ``np.int`` alias was the builtin ``int``; use it directly.
        xmin = np.clip(np.floor(box[:, 0].min()).astype(int), 0, w - 1)
        xmax = np.clip(np.ceil(box[:, 0].max()).astype(int), 0, w - 1)
        ymin = np.clip(np.floor(box[:, 1].min()).astype(int), 0, h - 1)
        ymax = np.clip(np.ceil(box[:, 1].max()).astype(int), 0, h - 1)

        mask = np.zeros((ymax - ymin + 1, xmax - xmin + 1), dtype=np.uint8)
        box[:, 0] = box[:, 0] - xmin
        box[:, 1] = box[:, 1] - ymin
        cv2.fillPoly(mask, box.reshape(1, -1, 2).astype(np.int32), 1)
        return cv2.mean(bitmap[ymin:ymax + 1, xmin:xmax + 1], mask)[0]

    def __call__(self, outs_dict, ratio_list):
        """Return one box collection per image of the batch.

        ``outs_dict['maps'][:, 0]`` is thresholded with ``self.thresh``; boxes
        scoring above ``self.box_thresh`` are kept and divided by the
        (ratio_h, ratio_w) pair of ``ratio_list``. An image without boxes
        contributes an empty list, otherwise an int16 array (n, 4, 2).
        """
        pred = outs_dict['maps']
        pred = pred[:, 0, :, :]
        segmentation = pred > self.thresh

        boxes_batch = []
        for batch_index in range(pred.shape[0]):
            height, width = pred.shape[-2:]
            tmp_boxes, tmp_scores = self.boxes_from_bitmap(
                pred[batch_index], segmentation[batch_index], width, height)

            boxes = [
                tmp_boxes[k] for k in range(len(tmp_boxes))
                if tmp_scores[k] > self.box_thresh
            ]
            if len(boxes) > 0:
                boxes = np.array(boxes)
                ratio_h, ratio_w = ratio_list[batch_index]
                boxes[:, :, 0] = boxes[:, :, 0] / ratio_w
                boxes[:, :, 1] = boxes[:, :, 1] / ratio_h
            boxes_batch.append(boxes)
        return boxes_batch

View File

@ -0,0 +1,121 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import numpy as np
from .locality_aware_nms import nms_locality
import cv2
class EASTPostPocess(object):
    """
    The post process for EAST.
    """

    def __init__(self, params):
        """Read ``score_thresh``, ``cover_thresh`` and ``nms_thresh`` from ``params``."""
        self.score_thresh = params['score_thresh']
        self.cover_thresh = params['cover_thresh']
        self.nms_thresh = params['nms_thresh']

    def restore_rectangle_quad(self, origin, geometry):
        """
        Restore quadrangles from per-pixel corner offsets.

        ``origin`` is (n, 2) and ``geometry`` is (n, 8); each of the four
        corners is ``origin - offset``. Returns an (n, 4, 2) array.
        """
        tiled_origin = np.concatenate(
            (origin, origin, origin, origin), axis=1)  # (n, 8)
        return (tiled_origin - geometry).reshape((-1, 4, 2))  # (n, 4, 2)

    def detect(self,
               score_map,
               geo_map,
               score_thresh=0.8,
               cover_thresh=0.1,
               nms_thresh=0.2):
        """
        restore text boxes from score map and geo map

        Returns an (m, 9) array (8 coordinates + score) or an empty list when
        nothing passes ``score_thresh`` or survives NMS.
        """
        score_map = score_map[0]
        # move the first axis of geo_map to the end
        geo_map = np.swapaxes(np.swapaxes(geo_map, 1, 0), 1, 2)

        # pixels above the score threshold, as (row, col) pairs
        hits = np.argwhere(score_map > score_thresh)
        if len(hits) == 0:
            return []
        # sort the candidates via the y axis
        hits = hits[np.argsort(hits[:, 0])]
        rows, cols = hits[:, 0], hits[:, 1]

        # restore quad proposals; (col, row) * 4 is used as the pixel origin
        quads = self.restore_rectangle_quad(hits[:, ::-1] * 4,
                                            geo_map[rows, cols, :])
        boxes = np.zeros((quads.shape[0], 9), dtype=np.float32)
        boxes[:, :8] = quads.reshape((-1, 8))
        boxes[:, 8] = score_map[rows, cols]
        boxes = nms_locality(boxes.astype(np.float64), nms_thresh)
        if boxes.shape[0] == 0:
            return []

        # Here we filter some low score boxes by the average score map,
        # this is different from the original paper.
        for i, box in enumerate(boxes):
            mask = np.zeros_like(score_map, dtype=np.uint8)
            cv2.fillPoly(mask, box[:8].reshape(
                (-1, 4, 2)).astype(np.int32) // 4, 1)
            boxes[i, 8] = cv2.mean(score_map, mask)[0]
        return boxes[boxes[:, 8] > cover_thresh]

    def sort_poly(self, p):
        """
        Sort polygons.

        Rotates the 4 points so the one with the smallest x + y comes first;
        if the first edge is not longer in x than in y, the point order is
        reversed around the first point.
        """
        start = np.argmin(np.sum(p, axis=1))
        p = p[[start, (start + 1) % 4, (start + 2) % 4, (start + 3) % 4]]
        first_edge_dx = abs(p[0, 0] - p[1, 0])
        first_edge_dy = abs(p[0, 1] - p[1, 1])
        if first_edge_dx > first_edge_dy:
            return p
        return p[[0, 3, 2, 1]]

    def __call__(self, outs_dict, ratio_list):
        """Return, per image, an array of int32 quads scaled back by the ratios.

        Reads ``outs_dict['f_score']`` and ``outs_dict['f_geo']``; quads whose
        first or last edge is shorter than 5 are discarded.
        """
        score_list = outs_dict['f_score']
        geo_list = outs_dict['f_geo']
        dt_boxes_list = []
        for ino in range(len(ratio_list)):
            boxes = self.detect(
                score_map=score_list[ino],
                geo_map=geo_list[ino],
                score_thresh=self.score_thresh,
                cover_thresh=self.cover_thresh,
                nms_thresh=self.nms_thresh)
            boxes_norm = []
            if len(boxes) > 0:
                ratio_h, ratio_w = ratio_list[ino]
                boxes = boxes[:, :8].reshape((-1, 4, 2))
                boxes[:, :, 0] /= ratio_w
                boxes[:, :, 1] /= ratio_h
                for box in boxes:
                    box = self.sort_poly(box.astype(np.int32))
                    too_small = np.linalg.norm(box[0] - box[1]) < 5 \
                        or np.linalg.norm(box[3] - box[0]) < 5
                    if not too_small:
                        boxes_norm.append(box)
            dt_boxes_list.append(np.array(boxes_norm))
        return dt_boxes_list

View File

@ -0,0 +1,199 @@
"""
Locality aware nms.
"""
import numpy as np
from shapely.geometry import Polygon
def intersection(g, p):
    """
    Intersection over union of two quadrilaterals.

    Args:
        g, p: arrays whose first 8 values are the x, y coordinates of 4 points.

    Returns:
        inter / union, or 0 when a cleaned polygon is invalid or the union
        area is 0.
    """
    # buffer(0) repairs self-intersecting quads. Its result may be a
    # MultiPolygon, which cannot be wrapped in Polygon() again, so the
    # cleaned geometries are intersected directly.
    g = Polygon(g[:8].reshape((4, 2))).buffer(0)
    p = Polygon(p[:8].reshape((4, 2))).buffer(0)
    if not g.is_valid or not p.is_valid:
        return 0
    inter = g.intersection(p).area
    union = g.area + p.area - inter
    if union == 0:
        return 0
    return inter / union
def intersection_iog(g, p):
    """
    Intersection area of g and p divided by the area of p.

    Both arguments carry the 4 corner points in their first 8 values.
    Returns 0 when either polygon is invalid or p has zero area.
    """
    g = Polygon(g[:8].reshape((4, 2)))
    p = Polygon(p[:8].reshape((4, 2)))
    if not (g.is_valid and p.is_valid):
        return 0
    inter = Polygon(g).intersection(Polygon(p)).area
    # unlike plain IoU, only the area of p is used as the denominator
    p_area = p.area
    if p_area == 0:
        print("p_area is very small")
        return 0
    return inter / p_area
def weighted_merge(g, p):
    """
    Merge p into g, weighting the coordinates by the scores.

    The first 8 values of g become the score-weighted mean of both boxes and
    g[8] becomes the sum of both scores. g is modified in place and returned.
    """
    score_g, score_p = g[8], p[8]
    g[:8] = (score_g * g[:8] + score_p * p[:8]) / (score_g + score_p)
    g[8] = score_g + score_p
    return g
def standard_nms(S, thres):
    """
    Greedy NMS on an N x 9 array (8 coordinates + score).

    Boxes are visited by decreasing score; a box is dropped when its
    ``intersection`` with an already kept box exceeds ``thres``.
    Returns the kept rows of S.
    """
    remaining = np.argsort(S[:, 8])[::-1]
    kept = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)
        overlaps = np.array([intersection(S[best], S[idx]) for idx in rest])
        survivors = np.where(overlaps <= thres)[0]
        remaining = rest[survivors]
    return S[kept]
def standard_nms_inds(S, thres):
    """
    Standard nms, return inds.

    Same greedy procedure as a score-ordered NMS on an N x 9 array, but the
    list of kept row indices is returned instead of the rows.
    """
    remaining = np.argsort(S[:, 8])[::-1]
    kept = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)
        overlaps = np.array([intersection(S[best], S[idx]) for idx in rest])
        survivors = np.where(overlaps <= thres)[0]
        remaining = rest[survivors]
    return kept
def nms(S, thres):
    """
    Greedy NMS on an N x 9 array; returns the list of kept row indices.

    Rows are visited by decreasing score (column 8); a row is dropped when its
    ``intersection`` with an already kept row exceeds ``thres``.
    """
    remaining = np.argsort(S[:, 8])[::-1]
    kept = []
    while remaining.size > 0:
        best, rest = remaining[0], remaining[1:]
        kept.append(best)
        overlaps = np.array([intersection(S[best], S[idx]) for idx in rest])
        survivors = np.where(overlaps <= thres)[0]
        remaining = rest[survivors]
    return kept
def soft_nms(boxes_in, Nt_thres=0.3, threshold=0.8, sigma=0.5, method=2):
    """
    soft_nms
    :param boxes_in: N x 9 array (8 coords + score); it is copied, not modified
    :param Nt_thres: IoU threshold used by the linear (1) and hard methods
    :param threshold: boxes whose decayed score falls below it are discarded
    :param sigma: gaussian weight parameter (method 2)
    :param method: 1 = linear decay, 2 = gaussian decay, anything else = hard NMS
    :return: the surviving rows, highest score first
    """
    boxes = boxes_in.copy()
    N = boxes.shape[0]
    if N is None or N < 1:
        return np.array([])
    inds = np.arange(N)

    # range(N) is fixed up front; N itself shrinks as boxes are discarded.
    for i in range(N):
        # find the highest-scored box among rows i .. N-1
        best_score, best_pos = boxes[i, 8], i
        for cand in range(i + 1, N):
            if best_score < boxes[cand, 8]:
                best_score, best_pos = boxes[cand, 8], cand

        # swap it into row i
        saved_box, saved_ind = boxes[i].copy(), inds[i]
        boxes[i, :] = boxes[best_pos, :]
        inds[i] = inds[best_pos]
        boxes[best_pos, :] = saved_box
        inds[best_pos] = saved_ind

        top_box = boxes[i].copy()
        pos = i + 1
        # decay the score of every remaining box that overlaps the top box
        while pos < N:
            iou = intersection(top_box, boxes[pos].copy())
            if iou > 0:
                if method == 1:
                    weight = 1 - iou if iou > Nt_thres else 1
                elif method == 2:
                    weight = np.exp(-1.0 * iou**2 / sigma)
                else:
                    weight = 0 if iou > Nt_thres else 1
                boxes[pos, 8] = weight * boxes[pos, 8]
                # if the score falls below the threshold, discard the box by
                # moving the last box into its slot and re-checking that slot
                if boxes[pos, 8] < threshold:
                    boxes[pos, :] = boxes[N - 1, :]
                    inds[pos] = inds[N - 1]
                    N -= 1
                    pos -= 1
            pos += 1
    return boxes[:N]
def nms_locality(polys, thres=0.3):
    """
    locality aware nms of EAST
    :param polys: a N*9 numpy array. first 8 coordinates, then prob
    :param thres: overlap threshold used both for merging and for the final nms
    :return: boxes after nms

    Consecutive rows that overlap the running box by more than ``thres`` are
    folded into it with ``weighted_merge`` (which writes into the rows of
    ``polys``); the merged boxes then go through ``standard_nms``.
    """
    merged = []
    current = None
    for quad in polys:
        if current is not None and intersection(quad, current) > thres:
            current = weighted_merge(quad, current)
            continue
        if current is not None:
            merged.append(current)
        current = quad
    if current is not None:
        merged.append(current)

    if len(merged) == 0:
        return np.array([])
    return standard_nms(np.array(merged), thres)
if __name__ == '__main__':
    # Manual smoke check: print the area of the sample quadrilateral
    # 343,350,448,135,474,143,369,359
    sample_quad = np.array([[343, 350], [448, 135], [474, 143], [369, 359]])
    print(Polygon(sample_quad).area)

13
ppocr/utils/__init__.py Executable file
View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

171
ppocr/utils/character.py Executable file
View File

@ -0,0 +1,171 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import numpy as np
import string
import re
from .check import check_config_params
import sys
class CharacterOps(object):
    """ Convert between text-label and text-index """

    def __init__(self, config):
        """Build the character table.

        Reads ``character_type`` ("en", "ch" or "en_sensitive") and
        ``loss_type`` from ``config``; for "ch" the alphabet is loaded from the
        UTF-8 file at ``config['character_dict_path']``, one entry per line.
        With ``loss_type == "attention"`` the tokens "sos" and "eos" take
        indices 0 and 1.

        Raises:
            AssertionError: if ``character_type`` is not supported.
        """
        self.character_type = config['character_type']
        self.loss_type = config['loss_type']
        if self.character_type == "en":
            self.character_str = "0123456789abcdefghijklmnopqrstuvwxyz"
        elif self.character_type == "ch":
            character_dict_path = config['character_dict_path']
            self.character_str = ""
            with open(character_dict_path, "rb") as fin:
                for line in fin.readlines():
                    # strip "\r" as well so CRLF dictionaries do not leak
                    # carriage returns into the alphabet
                    self.character_str += line.decode('utf-8').strip("\r\n")
        elif self.character_type == "en_sensitive":
            # same with ASTER setting (use 94 char).
            self.character_str = string.printable[:-6]
        else:
            self.character_str = None
            # Raised explicitly (not via ``assert``) so the check survives -O,
            # and reports the offending type rather than the None alphabet.
            raise AssertionError("Nonsupport type of the character: {}".format(
                self.character_type))
        dict_character = list(self.character_str)

        self.beg_str = "sos"
        self.end_str = "eos"
        if self.loss_type == "attention":
            dict_character = [self.beg_str, self.end_str] + dict_character
        self.dict = {}
        for i, char in enumerate(dict_character):
            self.dict[char] = i
        self.character = dict_character

    def encode(self, text):
        """convert text-label into text-index.
        input:
            text: the text label of one image (a string); it is lower-cased
                when character_type is "en"
        output:
            text: numpy array with the index of every character found in the
                table; characters that are not in the table are skipped
        """
        if self.character_type == "en":
            text = text.lower()

        text_list = [self.dict[char] for char in text if char in self.dict]
        return np.array(text_list)

    def decode(self, text_index, is_remove_duplicate=False):
        """ convert text-index into text-label.

        Indices of the begin/end tokens (attention) or the index equal to the
        table size (other losses) are skipped. With ``is_remove_duplicate`` an
        index equal to its immediate predecessor in ``text_index`` is skipped
        too.
        """
        char_list = []
        char_num = self.get_char_num()

        if self.loss_type == "attention":
            beg_idx = self.get_beg_end_flag_idx("beg")
            end_idx = self.get_beg_end_flag_idx("end")
            ignored_tokens = [beg_idx, end_idx]
        else:
            ignored_tokens = [char_num]

        for idx in range(len(text_index)):
            if text_index[idx] in ignored_tokens:
                continue
            if is_remove_duplicate:
                if idx > 0 and text_index[idx - 1] == text_index[idx]:
                    continue
            char_list.append(self.character[text_index[idx]])
        return ''.join(char_list)

    def get_char_num(self):
        """Return the size of the character table."""
        return len(self.character)

    def get_beg_end_flag_idx(self, beg_or_end):
        """Return the index of the "beg" or "end" token as a 0-d numpy array.

        Raises:
            AssertionError: if the loss type is not "attention" or
                ``beg_or_end`` is neither "beg" nor "end".
        """
        if self.loss_type != "attention":
            err = "error in get_beg_end_flag_idx when using the loss %s"\
                % (self.loss_type)
            raise AssertionError(err)
        if beg_or_end == "beg":
            return np.array(self.dict[self.beg_str])
        if beg_or_end == "end":
            return np.array(self.dict[self.end_str])
        raise AssertionError("Unsupport type %s in get_beg_end_flag_idx"
                             % beg_or_end)
def cal_predicts_accuracy(char_ops,
                          preds,
                          preds_lod,
                          labels,
                          labels_lod,
                          is_remove_duplicate=False):
    """Exact-match accuracy between decoded predictions and labels.

    Args:
        char_ops: object providing ``decode(text_index, is_remove_duplicate)``.
        preds, labels: arrays holding the indices of all images back to back.
        preds_lod, labels_lod: offsets; image ``i`` spans
            ``[lod[i], lod[i + 1])`` in the matching array.
        is_remove_duplicate (bool): forwarded to ``decode`` for BOTH the
            predictions and the labels.
            NOTE(review): collapsing repeated characters in labels looks
            unintended — confirm against the callers.

    Returns:
        (acc, acc_num, img_num); ``acc`` is 0.0 when there are no images.
    """
    acc_num = 0
    img_num = 0
    for ino in range(len(labels_lod) - 1):
        beg_no = preds_lod[ino]
        end_no = preds_lod[ino + 1]
        preds_text = preds[beg_no:end_no].reshape(-1)
        preds_text = char_ops.decode(preds_text, is_remove_duplicate)

        beg_no = labels_lod[ino]
        end_no = labels_lod[ino + 1]
        labels_text = labels[beg_no:end_no].reshape(-1)
        labels_text = char_ops.decode(labels_text, is_remove_duplicate)
        img_num += 1

        if preds_text == labels_text:
            acc_num += 1
    # An empty batch used to raise ZeroDivisionError here.
    acc = acc_num * 1.0 / img_num if img_num > 0 else 0.0
    return acc, acc_num, img_num
def convert_rec_attention_infer_res(preds):
    """Flatten per-image attention predictions into (ids, lod).

    For every row, position 0 is dropped. If the value 1 occurs at least
    twice in the row, the row is additionally cut right before its second
    occurrence; otherwise everything after position 0 is kept.
    (presumably 1 is the "eos" index — confirm against CharacterOps)

    Returns:
        convert_ids: column vector (-1, 1) with all kept ids back to back.
        target_lod: offsets, starting at 0, one more entry than images.
    """
    target_lod = [0]
    convert_ids = []
    for row in preds:
        end_pos = np.where(row == 1)[0]
        if len(end_pos) <= 1:
            text_list = row[1:]
        else:
            text_list = row[1:end_pos[1]]
        target_lod.append(target_lod[-1] + len(text_list))
        convert_ids.extend(text_list)
    convert_ids = np.array(convert_ids).reshape((-1, 1))
    return convert_ids, target_lod
def convert_rec_label_to_lod(ori_labels):
    """Flatten a list of per-image label sequences into (ids, lod).

    Returns:
        convert_ids: column vector (-1, 1) with all labels back to back.
        target_lod: cumulative lengths, starting at 0.
    """
    target_lod = [0]
    convert_ids = []
    for label in ori_labels:
        target_lod.append(target_lod[-1] + len(label))
        convert_ids.extend(label)
    convert_ids = np.array(convert_ids).reshape((-1, 1))
    return convert_ids, target_lod

33
ppocr/utils/check.py Executable file
View File

@ -0,0 +1,33 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from __future__ import unicode_literals
import sys
import paddle.fluid as fluid
import logging
logger = logging.getLogger(__name__)
def check_config_params(config, config_name, params):
    """Assert that every name in ``params`` is a key of ``config``.

    Raises:
        AssertionError: naming the first missing parameter and ``config_name``.
    """
    for param in params:
        assert param in config, \
            "param %s didn't find in %s!" % (param, config_name)
    return

File diff suppressed because it is too large Load Diff

131
ppocr/utils/save_load.py Executable file
View File

@ -0,0 +1,131 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
#Licensed under the Apache License, Version 2.0 (the "License");
#you may not use this file except in compliance with the License.
#You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
#Unless required by applicable law or agreed to in writing, software
#distributed under the License is distributed on an "AS IS" BASIS,
#WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
#See the License for the specific language governing permissions and
#limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import errno
import os
import shutil
import tempfile
import paddle
import paddle.fluid as fluid
from .utility import initial_logger
import re
logger = initial_logger()
def _mkdir_if_not_exist(path):
    """
    Create ``path`` (and parents) unless it already exists.

    Losing the creation race against another process is only logged; any
    other failure is re-raised as ``OSError('Failed to mkdir <path>')``.
    """
    if os.path.exists(path):
        return
    try:
        os.makedirs(path)
    except OSError as e:
        created_by_someone_else = \
            e.errno == errno.EEXIST and os.path.isdir(path)
        if not created_by_someone_else:
            raise OSError('Failed to mkdir {}'.format(path))
        logger.warning(
            'be happy if some process has already created {}'.format(path))
def _load_state(path):
    """Load the program state stored under the ``path`` prefix.

    When ``path + '.pdopt'`` exists, only ``path + '.pdparams'`` is copied to
    a temporary directory and loaded from there, so the optimizer state is
    ignored. The temporary directory is removed even if copying or loading
    fails.

    Returns:
        The result of ``fluid.io.load_program_state``.
    """
    if os.path.exists(path + '.pdopt'):
        # XXX another hack to ignore the optimizer state
        tmp = tempfile.mkdtemp()
        try:
            dst = os.path.join(tmp, os.path.basename(os.path.normpath(path)))
            shutil.copy(path + '.pdparams', dst + '.pdparams')
            state = fluid.io.load_program_state(dst)
        finally:
            # mkdtemp leaves cleanup to the caller; do it on every exit path
            shutil.rmtree(tmp)
    else:
        state = fluid.io.load_program_state(path)
    return state
def load_params(exe, prog, path, ignore_params=[]):
    """
    Load pretrained weights into ``prog``.
    Args:
        exe (fluid.Executor): The fluid.Executor object (not used here).
        prog (fluid.Program): load weight to which Program object.
        path (string): model directory, or path prefix of a ``.pdparams`` file.
        ignore_params (list): regular expressions; variables of ``prog`` whose
            name matches one of them (``re.match``) are not loaded.
    Raises:
        ValueError: if ``path`` is neither a directory nor such a prefix.

    Parameters whose shape differs between ``prog`` and the loaded state are
    skipped as well; each skipped variable is logged with a warning.
    """
    path_is_usable = os.path.isdir(path) or \
        os.path.exists(path + '.pdparams')
    if not path_is_usable:
        raise ValueError("Model pretrain path {} does not "
                         "exists.".format(path))

    logger.info('Loading parameters from {}...'.format(path))

    ignore_set = set()
    state = _load_state(path)

    # ignore the parameter which mismatch the shape
    # between the model and pretrain weight.
    all_var_shape = {}
    for block in prog.blocks:
        for param in block.all_parameters():
            all_var_shape[param.name] = param.shape
    for name, shape in all_var_shape.items():
        if name in state and shape != state[name].shape:
            ignore_set.add(name)

    if ignore_params:
        for var in prog.list_vars():
            if any([re.match(pattern, var.name) for pattern in ignore_params]):
                ignore_set.add(var.name)

    for name in ignore_set:
        if name in state:
            logger.warning('variable {} not used'.format(name))
            del state[name]
    fluid.io.set_program_state(prog, state)
def init_model(config, program, exe):
    """
    Initialise ``program`` from a checkpoint or from pretrained weights.

    ``config['Global']['checkpoints']`` takes priority; when it is unset,
    ``config['Global']['pretrain_weights']`` is used. When neither is set,
    nothing is loaded.
    """
    global_config = config['Global']

    checkpoint_path = global_config.get('checkpoints')
    if checkpoint_path:
        fluid.load(program, checkpoint_path, exe)
        logger.info("Finish initing model from {}".format(checkpoint_path))
        return

    weights_path = global_config.get('pretrain_weights')
    if not weights_path:
        return
    load_params(exe, program, weights_path)
    logger.info("Finish initing model from {}".format(weights_path))
def save_model(program, model_path):
    """
    Save ``program`` to ``model_path`` with ``fluid.save`` and log the path.
    """
    fluid.save(program, model_path)
    message = "Already save model in {}".format(model_path)
    logger.info(message)

65
ppocr/utils/stats.py Executable file
View File

@ -0,0 +1,65 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import collections
import numpy as np
import datetime
__all__ = ['TrainingStats', 'Time']
class SmoothedValue(object):
    """Keep the most recent ``window_size`` values and report their median."""

    def __init__(self, window_size):
        # A bounded deque drops the oldest value once the window is full.
        self.deque = collections.deque([], window_size)

    def add_value(self, value):
        """Append ``value`` to the window."""
        window = self.deque
        window.append(value)

    def get_median_value(self):
        """Return ``np.median`` of the values currently in the window."""
        window = self.deque
        return np.median(window)
def Time():
    """Return the current local time as 'YYYY-MM-DD HH:MM:SS.ffffff'."""
    now = datetime.datetime.now()
    return now.strftime('%Y-%m-%d %H:%M:%S.%f')
class TrainingStats(object):
    """Track one SmoothedValue per statistic key and report their medians."""

    def __init__(self, window_size, stats_keys):
        trackers = {}
        for key in stats_keys:
            trackers[key] = SmoothedValue(window_size)
        self.smoothed_losses_and_metrics = trackers

    def update(self, stats):
        """Feed ``stats[key]`` into the tracker of every tracked key."""
        for name, tracker in self.smoothed_losses_and_metrics.items():
            tracker.add_value(stats[name])

    def get(self, extras=None):
        """
        Return an OrderedDict holding ``extras`` (if given) followed by the
        median of every tracked key, rounded to 6 decimals.
        """
        report = collections.OrderedDict()
        if extras:
            for name, value in extras.items():
                report[name] = value
        for name, tracker in self.smoothed_losses_and_metrics.items():
            median = tracker.get_median_value()
            report[name] = round(median, 6)
        return report

    def log(self, extras=None):
        """Format the result of ``get`` as a comma separated 'key: value' string."""
        report = self.get(extras)
        parts = []
        for name, value in report.items():
            # str() of a one-item dict minus the surrounding braces.
            parts.append(str({name: value}).strip('{}'))
        return ', '.join(parts)

71
ppocr/utils/utility.py Executable file
View File

@ -0,0 +1,71 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import logging
def initial_logger():
    """
    Configure root logging (INFO level, '<time>-<level>: <message>' format)
    and return the logger named after this module.
    """
    logging.basicConfig(
        level=logging.INFO, format='%(asctime)s-%(levelname)s: %(message)s')
    return logging.getLogger(__name__)
import importlib
def create_module(module_str):
    """
    Resolve a "module.path,attribute" string to the named attribute.

    Args:
        module_str (str): module path and attribute name separated by
            exactly one comma, e.g. "pkg.module,ClassName".

    Returns:
        The attribute ``ClassName`` looked up on the imported module.

    Raises:
        AssertionError: if ``module_str`` does not contain exactly one comma.
    """
    tmpss = module_str.split(",")
    # The message used to be split by a backslash continuation inside the
    # string literal, which glued the words together and misspelled "format".
    assert len(tmpss) == 2, \
        "Error format of the module path: {}".format(module_str)
    module_name, function_name = tmpss
    somemodule = importlib.import_module(module_name, __package__)
    return getattr(somemodule, function_name)
def get_check_global_params(mode):
    """
    Return the names of the Global config entries to check for ``mode``.

    Args:
        mode (str): "train_eval" adds both batch-size entries, "test" adds
            the test batch-size entry; any other value returns the common
            entries only.

    Returns:
        list of str: parameter names, without duplicates.
    """
    # 'image_shape' used to be listed twice; the duplicate is dropped.
    check_params = [
        'use_gpu', 'max_text_length', 'image_shape', 'character_type',
        'loss_type'
    ]
    if mode == "train_eval":
        check_params = check_params + [
            'train_batch_size_per_card', 'test_batch_size_per_card'
        ]
    elif mode == "test":
        check_params = check_params + ['test_batch_size_per_card']
    return check_params
def get_check_reader_params(mode):
    """
    Return the reader section names to check for ``mode``:
    "train_eval" -> TrainReader and EvalReader, "test" -> TestReader,
    anything else -> an empty list.
    """
    if mode == "train_eval":
        return ['TrainReader', 'EvalReader']
    if mode == "test":
        return ['TestReader']
    return []
from paddle import fluid
def create_multi_devices_program(program, loss_var_name):
    """
    Wrap ``program`` in a data-parallel ``fluid.CompiledProgram``.

    The build strategy disables ``memory_optimize`` and enables
    ``enable_inplace``; the execution strategy sets
    ``num_iteration_per_drop_scope`` to 1.
    """
    build_options = fluid.BuildStrategy()
    build_options.memory_optimize = False
    build_options.enable_inplace = True

    exec_options = fluid.ExecutionStrategy()
    exec_options.num_iteration_per_drop_scope = 1

    compiled = fluid.CompiledProgram(program)
    return compiled.with_data_parallel(
        loss_name=loss_var_name,
        build_strategy=build_options,
        exec_strategy=exec_options)

102
tools/eval.py Executable file
View File

@ -0,0 +1,102 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
def set_paddle_flags(**kwargs):
    """
    Export each keyword argument as an environment variable (value converted
    with ``str``) unless a variable of that name is already set.
    """
    for flag_name, flag_value in kwargs.items():
        os.environ.setdefault(flag_name, str(flag_value))


# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
    FLAGS_eager_delete_tensor_gb=0,  # enable GC to save memory
)
import program
from paddle import fluid
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.data.reader_main import reader_main
from ppocr.utils.save_load import init_model
from eval_utils.eval_det_utils import eval_det_run
from eval_utils.eval_rec_utils import test_rec_benchmark
from eval_utils.eval_rec_utils import eval_rec_run
from ppocr.utils.character import CharacterOps
def main():
    """
    Evaluate a detection or recognition model described by the config file.

    Reads the module-level ``FLAGS`` (set in the ``__main__`` guard), builds
    the test program, loads the weights with ``init_model`` and then runs
    detection evaluation (EAST, DB) or recognition evaluation (Rosetta, CRNN,
    STARNet, RARE), printing the resulting metrics.
    """
    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    # NOTE(review): a literal True used to be passed here, which ignores the
    # configured value; presumably check_gpu expects the flag itself -
    # confirm against program.check_gpu.
    program.check_gpu(use_gpu)

    alg = config['Global']['algorithm']
    assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE']
    if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE']:
        config['Global']['char_ops'] = CharacterOps(config['Global'])

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    startup_prog = fluid.Program()
    eval_program = fluid.Program()
    eval_build_outputs = program.build(
        config, eval_program, startup_prog, mode='test')
    eval_fetch_name_list = eval_build_outputs[1]
    eval_fetch_varname_list = eval_build_outputs[2]
    eval_program = eval_program.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    init_model(config, eval_program, exe)

    # Entries shared by every evaluation path; 'reader' is added per path.
    eval_info_dict = {
        'program': eval_program,
        'fetch_name_list': eval_fetch_name_list,
        'fetch_varname_list': eval_fetch_varname_list
    }

    if alg in ['EAST', 'DB']:
        eval_info_dict['reader'] = reader_main(config=config, mode="test")
        metrics = eval_det_run(exe, config, eval_info_dict, "test")
        # The detection metrics used to be computed and then discarded.
        print("Eval result:", metrics)
    else:
        dataset = config['Global']['dataset']
        assert dataset in ['lmdb', 'common']
        if dataset == 'common':
            eval_info_dict['reader'] = reader_main(config=config, mode="eval")
            metrics = eval_rec_run(exe, config, eval_info_dict, "eval")
            print("Eval result:", metrics)
        else:
            # test_rec_benchmark installs its own reader per lmdb subset.
            test_rec_benchmark(exe, config, eval_info_dict)


if __name__ == '__main__':
    parser = program.ArgsParser()
    FLAGS = parser.parse_args()
    main()

View File

@ -0,0 +1,13 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.

View File

@ -0,0 +1,231 @@
#!/usr/bin/env python
# -*- coding: utf-8 -*-
from collections import namedtuple
import numpy as np
from shapely.geometry import Polygon
class DetectionIoUEvaluator(object):
    """
    IoU based matching of detected polygons against ground-truth polygons.

    Args:
        iou_constraint (float): a GT/detection pair is matched when its IoU
            is strictly greater than this value.
        area_precision_constraint (float): a detection is marked "don't
            care" when the fraction of its area covered by a "don't care"
            GT polygon is strictly greater than this value.
    """

    def __init__(self, iou_constraint=0.5, area_precision_constraint=0.5):
        self.iou_constraint = iou_constraint
        self.area_precision_constraint = area_precision_constraint

    def evaluate_image(self, gt, pred):
        """
        Match the detections of one image against its ground truth.

        Args:
            gt (list of dict): each item has 'points' (polygon vertices) and
                'ignore' (truthy for "don't care" regions).
            pred (list of dict): each item has 'points'.

        Returns:
            dict with keys 'precision', 'recall', 'hmean', 'pairs', 'iouMat',
            'gtPolPoints', 'detPolPoints', 'gtCare', 'detCare', 'gtDontCare',
            'detDontCare', 'detMatched' and 'evaluationLog'.
        """

        def get_union(pD, pG):
            return Polygon(pD).union(Polygon(pG)).area

        def get_intersection(pD, pG):
            return Polygon(pD).intersection(Polygon(pG)).area

        def get_intersection_over_union(pD, pG):
            union_area = get_union(pD, pG)
            # Two zero-area polygons have no union; report no overlap instead
            # of dividing by zero.
            if union_area == 0:
                return 0.0
            return get_intersection(pD, pG) / union_area

        def to_checked_polygon(points):
            # buffer(0) is applied before the validity check; polygons that
            # are still invalid or not simple are rejected (None).
            pol = Polygon(points).buffer(0)
            checked = Polygon(pol)
            if not checked.is_valid or not checked.is_simple:
                return None
            return pol

        detMatched = 0
        # Placeholder returned when there is nothing to compare; zeros keep
        # the result deterministic.
        iouMat = np.zeros([1, 1])

        gtPols = []
        detPols = []
        gtPolPoints = []
        detPolPoints = []

        # Indices (into gtPols) of ground-truth polygons marked don't care
        gtDontCarePolsNum = []
        # Indices (into detPols) of detections matched with a don't care GT
        detDontCarePolsNum = []

        pairs = []
        evaluationLog = ""

        for item in gt:
            gtPol = to_checked_polygon(item['points'])
            if gtPol is None:
                continue
            gtPols.append(gtPol)
            gtPolPoints.append(gtPol)
            if item['ignore']:
                gtDontCarePolsNum.append(len(gtPols) - 1)

        evaluationLog += "GT polygons: " + str(len(gtPols)) + (
            " (" + str(len(gtDontCarePolsNum)) + " don't care)\n"
            if len(gtDontCarePolsNum) > 0 else "\n")

        for item in pred:
            detPol = to_checked_polygon(item['points'])
            if detPol is None:
                continue
            detPols.append(detPol)
            detPolPoints.append(detPol)
            for gtIndex in gtDontCarePolsNum:
                dontCarePol = gtPols[gtIndex]
                intersected_area = get_intersection(dontCarePol, detPol)
                pdDimensions = Polygon(detPol).area
                covered = 0 if pdDimensions == 0 \
                    else intersected_area / pdDimensions
                if covered > self.area_precision_constraint:
                    detDontCarePolsNum.append(len(detPols) - 1)
                    break

        evaluationLog += "DET polygons: " + str(len(detPols)) + (
            " (" + str(len(detDontCarePolsNum)) + " don't care)\n"
            if len(detDontCarePolsNum) > 0 else "\n")

        if len(gtPols) > 0 and len(detPols) > 0:
            # Calculate the IoU matrix
            iouMat = np.empty([len(gtPols), len(detPols)])
            gtRectMat = np.zeros(len(gtPols), np.int8)
            detRectMat = np.zeros(len(detPols), np.int8)
            for gtNum in range(len(gtPols)):
                for detNum in range(len(detPols)):
                    iouMat[gtNum, detNum] = get_intersection_over_union(
                        detPols[detNum], gtPols[gtNum])

            # Greedy one-to-one matching in index order; don't care entries
            # never take part.
            for gtNum in range(len(gtPols)):
                for detNum in range(len(detPols)):
                    if gtRectMat[gtNum] != 0 or detRectMat[detNum] != 0:
                        continue
                    if gtNum in gtDontCarePolsNum or \
                            detNum in detDontCarePolsNum:
                        continue
                    if iouMat[gtNum, detNum] > self.iou_constraint:
                        gtRectMat[gtNum] = 1
                        detRectMat[detNum] = 1
                        detMatched += 1
                        pairs.append({'gt': gtNum, 'det': detNum})
                        evaluationLog += "Match GT #" + \
                            str(gtNum) + " with Det #" + str(detNum) + "\n"

        numGtCare = (len(gtPols) - len(gtDontCarePolsNum))
        numDetCare = (len(detPols) - len(detDontCarePolsNum))
        if numGtCare == 0:
            recall = float(1)
            precision = float(0) if numDetCare > 0 else float(1)
        else:
            recall = float(detMatched) / numGtCare
            precision = 0 if numDetCare == 0 \
                else float(detMatched) / numDetCare

        hmean = 0 if (precision + recall) == 0 else 2.0 * \
            precision * recall / (precision + recall)

        perSampleMetrics = {
            'precision': precision,
            'recall': recall,
            'hmean': hmean,
            'pairs': pairs,
            'iouMat': [] if len(detPols) > 100 else iouMat.tolist(),
            'gtPolPoints': gtPolPoints,
            'detPolPoints': detPolPoints,
            'gtCare': numGtCare,
            'detCare': numDetCare,
            'gtDontCare': gtDontCarePolsNum,
            'detDontCare': detDontCarePolsNum,
            'detMatched': detMatched,
            'evaluationLog': evaluationLog
        }
        return perSampleMetrics

    def combine_results(self, results):
        """
        Aggregate per-image results into dataset level metrics.

        Args:
            results (iterable of dict): items with 'gtCare', 'detCare' and
                'detMatched' as returned by ``evaluate_image``.

        Returns:
            dict with 'precision', 'recall' and 'hmean'; each is 0 when its
            denominator is 0.
        """
        numGlobalCareGt = 0
        numGlobalCareDet = 0
        matchedSum = 0
        for result in results:
            numGlobalCareGt += result['gtCare']
            numGlobalCareDet += result['detCare']
            matchedSum += result['detMatched']

        methodRecall = 0 if numGlobalCareGt == 0 else float(
            matchedSum) / numGlobalCareGt
        methodPrecision = 0 if numGlobalCareDet == 0 else float(
            matchedSum) / numGlobalCareDet
        methodHmean = 0 if methodRecall + methodPrecision == 0 else 2 * \
            methodRecall * methodPrecision / (methodRecall + methodPrecision)

        methodMetrics = {
            'precision': methodPrecision,
            'recall': methodRecall,
            'hmean': methodHmean
        }
        return methodMetrics
if __name__ == '__main__':
    # Small self-check: two ground-truth squares, one detection that
    # overlaps the first square.
    evaluator = DetectionIoUEvaluator()
    first_gt = {
        'points': [(0, 0), (1, 0), (1, 1), (0, 1)],
        'text': 1234,
        'ignore': False,
    }
    second_gt = {
        'points': [(2, 2), (3, 2), (3, 3), (2, 3)],
        'text': 5678,
        'ignore': False,
    }
    detection = {
        'points': [(0.1, 0.1), (1, 0), (1, 1), (0, 1)],
        'text': 123,
        'ignore': False,
    }
    gts = [[first_gt, second_gt]]
    preds = [[detection]]
    results = [
        evaluator.evaluate_image(gt, pred) for gt, pred in zip(gts, preds)
    ]
    metrics = evaluator.combine_results(results)
    print(metrics)

View File

@ -0,0 +1,131 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import numpy as np
import paddle.fluid as fluid
__all__ = ['eval_det_run']
import logging
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
from ppocr.utils.utility import create_module
from .eval_det_iou import DetectionIoUEvaluator
import json
from copy import deepcopy
import cv2
from ppocr.data.reader_main import reader_main
def cal_det_res(exe, config, eval_info_dict):
    """
    Run the detection program over every batch of the reader and write one
    line per image ("<image name>\\t<json list of boxes>\\n") to
    ``config['Global']['save_res_path']``.

    The post-process callable is created from
    ``config['PostProcess']['function']`` with the PostProcess parameters
    overlaid by the Global parameters.
    """
    global_params = config['Global']
    save_res_path = global_params['save_res_path']

    postprocess_params = deepcopy(config["PostProcess"])
    postprocess_params.update(global_params)
    postprocess_factory = create_module(postprocess_params['function'])
    postprocess = postprocess_factory(params=postprocess_params)

    with open(save_res_path, "wb") as fout:
        processed_num = 0
        for batch in eval_info_dict['reader']():
            img_num = len(batch)
            processed_num += img_num
            logger.info("test tackling num:%d", processed_num)

            # Each sample is read as (image, ratio, image name).
            images = [batch[i][0] for i in range(img_num)]
            ratio_list = [batch[i][1] for i in range(img_num)]
            img_name_list = [batch[i][2] for i in range(img_num)]
            images = np.concatenate(images, axis=0)

            outs = exe.run(
                eval_info_dict['program'],
                feed={'image': images},
                fetch_list=eval_info_dict['fetch_varname_list'])

            # Map every fetched output to its configured name.
            outs_dict = {}
            for idx, out in enumerate(outs):
                fetch_name = eval_info_dict['fetch_name_list'][idx]
                outs_dict[fetch_name] = np.array(out)

            dt_boxes_list = postprocess(outs_dict, ratio_list)
            for idx in range(img_num):
                boxes_json = []
                for box in dt_boxes_list[idx]:
                    entry = {"transcription": ""}
                    entry['points'] = box.tolist()
                    boxes_json.append(entry)
                line = img_name_list[idx] + "\t" + json.dumps(
                    boxes_json) + "\n"
                fout.write(line.encode())
    return
def load_label_infor(label_file_path, do_ignore=False):
    """
    Read a label file with one "<image name>\\t<json list of boxes>" line per
    image.

    Every box dict gets an 'ignore' entry: True only when ``do_ignore`` is
    set and the box transcription is "###".

    Args:
        label_file_path (str): path of the label file (read as UTF-8).
        do_ignore (bool): whether "###" boxes are flagged as ignored.

    Returns:
        dict: image name -> list of box dicts.
    """
    img_name_label_dict = {}
    with open(label_file_path, "rb") as fin:
        for raw_line in fin:
            # Decode explicitly instead of relying on the default codec.
            line = raw_line.decode('utf-8').strip("\r\n")
            # Blank lines (e.g. a trailing newline) used to raise IndexError.
            if not line.strip():
                continue
            substr = line.split("\t")
            bbox_infor = json.loads(substr[1])
            for bbox in bbox_infor:
                bbox['ignore'] = bool(
                    do_ignore and bbox['transcription'] == "###")
            img_name_label_dict[substr[0]] = bbox_infor
    return img_name_label_dict
def cal_det_metrics(gt_label_path, save_res_path):
    """
    Compare the saved detection results with the ground-truth labels and
    return the combined metrics of ``DetectionIoUEvaluator``.

    Images listed in the ground truth but missing from the results are
    evaluated with an empty detection list.
    """
    evaluator = DetectionIoUEvaluator()
    gt_by_image = load_label_infor(gt_label_path, do_ignore=True)
    dt_by_image = load_label_infor(save_res_path)

    per_image_results = []
    for img_name, gt_boxes in gt_by_image.items():
        dt_boxes = dt_by_image[img_name] if img_name in dt_by_image else []
        per_image_results.append(evaluator.evaluate_image(gt_boxes, dt_boxes))
    return evaluator.combine_results(per_image_results)
def eval_det_run(exe, config, eval_info_dict, mode):
    """
    Write the detection results with ``cal_det_res`` and score them.

    For mode "eval" the EvalReader label file is always scored. For any other
    mode the TestReader label file is scored only when
    ``config['TestReader']['do_eval']`` is truthy; otherwise an empty dict is
    returned.
    """
    cal_det_res(exe, config, eval_info_dict)
    save_res_path = config['Global']['save_res_path']

    if mode == "eval":
        eval_label_path = config['EvalReader']['label_file_path']
        return cal_det_metrics(eval_label_path, save_res_path)

    test_reader_cfg = config['TestReader']
    test_label_path = test_reader_cfg['label_file_path']
    if not test_reader_cfg['do_eval']:
        return {}
    return cal_det_metrics(test_label_path, save_res_path)

View File

@ -0,0 +1,111 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import logging
import numpy as np
import paddle.fluid as fluid
__all__ = ['eval_rec_run', 'test_rec_benchmark']
import logging
FORMAT = '%(asctime)s-%(levelname)s: %(message)s'
logging.basicConfig(level=logging.INFO, format=FORMAT)
logger = logging.getLogger(__name__)
from ppocr.utils.character import cal_predicts_accuracy
from ppocr.utils.character import convert_rec_label_to_lod
from ppocr.utils.character import convert_rec_attention_infer_res
from ppocr.utils.utility import create_module
import json
from copy import deepcopy
import cv2
from ppocr.data.reader_main import reader_main
def eval_rec_run(exe, config, eval_info_dict, mode):
    """
    Run the recognition program over the reader and accumulate accuracy.

    Args:
        exe: executor used to run ``eval_info_dict['program']``.
        config (dict): must hold ``config['Global']['char_ops']``.
        eval_info_dict (dict): 'program', 'reader' (callable returning the
            batches) and 'fetch_varname_list'.
        mode (str): "eval" passes is_remove_duplicate=False to
            ``cal_predicts_accuracy``; any other value passes True.

    Returns:
        dict: 'avg_acc', 'total_acc_num' and 'total_sample_num'. 'avg_acc'
        is 0.0 when the reader produced no samples.
    """
    char_ops = config['Global']['char_ops']
    total_sample_num = 0
    total_acc_num = 0
    is_remove_duplicate = mode != "eval"

    for data in eval_info_dict['reader']():
        # Each sample is read as (image, label).
        img_list = [sample[0] for sample in data]
        label_list = [sample[1] for sample in data]
        img_list = np.concatenate(img_list, axis=0)
        outs = exe.run(eval_info_dict['program'],
                       feed={'image': img_list},
                       fetch_list=eval_info_dict['fetch_varname_list'],
                       return_numpy=False)
        preds = np.array(outs[0])
        if preds.shape[1] != 1:
            preds, preds_lod = convert_rec_attention_infer_res(preds)
        else:
            preds_lod = outs[0].lod()[0]
        labels, labels_lod = convert_rec_label_to_lod(label_list)
        acc, acc_num, sample_num = cal_predicts_accuracy(
            char_ops, preds, preds_lod, labels, labels_lod,
            is_remove_duplicate)
        total_acc_num += acc_num
        total_sample_num += sample_num

    # An empty reader used to raise ZeroDivisionError here.
    if total_sample_num > 0:
        avg_acc = total_acc_num * 1.0 / total_sample_num
    else:
        avg_acc = 0.0
    metrics = {
        'avg_acc': avg_acc,
        "total_acc_num": total_acc_num,
        "total_sample_num": total_sample_num
    }
    return metrics
def test_rec_benchmark(exe, config, eval_info_dict):
    """
    Evaluate lmdb data.

    Runs ``eval_rec_run`` on every benchmark sub-directory of
    ``config['TestReader']['lmdb_sets_dir']`` and logs the accuracy per
    data set plus the sample-weighted average.

    ``config['TestReader']['lmdb_sets_dir']`` is pointed at each subset while
    it is evaluated and restored afterwards; ``eval_info_dict['reader']`` is
    overwritten with the reader of the last subset.
    """
    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', 'IC03_867',
                      'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077',
                      'SVTP', 'CUTE80']
    eval_data_dir = config['TestReader']['lmdb_sets_dir']
    total_evaluation_data_number = 0
    total_correct_number = 0
    eval_data_acc_info = {}
    try:
        for eval_data in eval_data_list:
            config['TestReader']['lmdb_sets_dir'] = \
                eval_data_dir + "/" + eval_data
            eval_reader = reader_main(config=config, mode="test")
            eval_info_dict['reader'] = eval_reader
            metrics = eval_rec_run(exe, config, eval_info_dict, "test")
            total_evaluation_data_number += metrics['total_sample_num']
            total_correct_number += metrics['total_acc_num']
            eval_data_acc_info[eval_data] = metrics
    finally:
        # Leave the caller's config as it was, so that a second call does
        # not build paths below the last subset.
        config['TestReader']['lmdb_sets_dir'] = eval_data_dir

    if total_evaluation_data_number > 0:
        avg_acc = total_correct_number * 1.0 / total_evaluation_data_number
    else:
        avg_acc = 0.0

    logger.info('-' * 50)
    strs = ""
    for eval_data in eval_data_list:
        eval_acc = eval_data_acc_info[eval_data]['avg_acc']
        strs += "\n {}, accuracy:{:.6f}".format(eval_data, eval_acc)
    strs += "\n average, accuracy:{:.6f}".format(avg_acc)
    logger.info(strs)
    logger.info('-' * 50)

88
tools/export_model.py Normal file
View File

@ -0,0 +1,88 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import multiprocessing
import numpy as np
def set_paddle_flags(**kwargs):
    """
    Export each keyword argument as an environment variable (value converted
    with ``str``) unless a variable of that name is already set.
    """
    for flag_name, flag_value in kwargs.items():
        os.environ.setdefault(flag_name, str(flag_value))


# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
    FLAGS_eager_delete_tensor_gb=0,  # enable GC to save memory
)
import program
from paddle import fluid
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.save_load import init_model
from ppocr.utils.character import CharacterOps
from ppocr.utils.utility import create_module
def main():
    """
    Export the configured model as an inference model.

    Reads the module-level ``FLAGS`` (set in the ``__main__` guard`), builds
    the export program, loads the weights with ``init_model`` and saves the
    inference model to "./output/" (files 'model' and 'params').
    """
    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    # NOTE(review): a literal True used to be passed here, which ignores the
    # configured value; presumably check_gpu expects the flag itself -
    # confirm against program.check_gpu.
    program.check_gpu(use_gpu)

    alg = config['Global']['algorithm']
    assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE']
    if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE']:
        config['Global']['char_ops'] = CharacterOps(config['Global'])

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    startup_prog = fluid.Program()
    eval_program = fluid.Program()

    feeded_var_names, target_vars, fetches_var_name = program.build_export(
        config, eval_program, startup_prog)
    eval_program = eval_program.clone(for_test=True)

    exe = fluid.Executor(place)
    exe.run(startup_prog)
    init_model(config, eval_program, exe)

    fluid.io.save_inference_model(
        dirname="./output/",
        feeded_var_names=feeded_var_names,
        main_program=eval_program,
        target_vars=target_vars,
        executor=exe,
        model_filename='model',
        params_filename='params')
    print("save success, output_name_list:", fetches_var_name)


if __name__ == '__main__':
    parser = program.ArgsParser()
    FLAGS = parser.parse_args()
    main()

View File

@ -0,0 +1 @@
<paddle.fluid.core_avx.ProgramDesc object at 0x10d15fab0>

169
tools/infer/predict_det.py Executable file
View File

@ -0,0 +1,169 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utility
from ppocr.utils.utility import initial_logger
logger = initial_logger()
import cv2
from ppocr.data.det.east_process import EASTProcessTest
from ppocr.data.det.db_process import DBProcessTest
from ppocr.postprocess.db_postprocess import DBPostProcess
from ppocr.postprocess.east_postprocess import EASTPostPocess
import copy
import numpy as np
import math
import time
class TextDetector(object):
    """
    Text detector running a DB or EAST inference model.

    Args:
        args: parsed arguments; reads ``det_max_side_len``,
            ``det_algorithm`` and the threshold options of the selected
            algorithm, and is forwarded to ``utility.create_predictor``.
    """

    def __init__(self, args):
        max_side_len = args.det_max_side_len
        self.det_algorithm = args.det_algorithm
        preprocess_params = {'max_side_len': max_side_len}
        postprocess_params = {}
        if self.det_algorithm == "DB":
            self.preprocess_op = DBProcessTest(preprocess_params)
            postprocess_params["thresh"] = args.det_db_thresh
            postprocess_params["box_thresh"] = args.det_db_box_thresh
            postprocess_params["max_candidates"] = 1000
            self.postprocess_op = DBPostProcess(postprocess_params)
        elif self.det_algorithm == "EAST":
            self.preprocess_op = EASTProcessTest(preprocess_params)
            postprocess_params["score_thresh"] = args.det_east_score_thresh
            postprocess_params["cover_thresh"] = args.det_east_cover_thresh
            postprocess_params["nms_thresh"] = args.det_east_nms_thresh
            self.postprocess_op = EASTPostPocess(postprocess_params)
        else:
            # `sys` is not imported at the top of this file, so the exit call
            # used to fail with NameError; import it locally and leave with a
            # non-zero status because this is an error path.
            import sys
            logger.info("unknown det_algorithm:{}".format(self.det_algorithm))
            sys.exit(1)

        self.predictor, self.input_tensor, self.output_tensors =\
            utility.create_predictor(args, mode="det")

    def order_points_clockwise(self, pts):
        """
        Order four points as top-left, top-right, bottom-right, bottom-left.

        Adapted from
        https://github.com/jrosebr1/imutils/blob/master/imutils/perspective.py

        Args:
            pts (np.ndarray): array indexed as ``pts[:, 0]`` (x) and
                ``pts[:, 1]`` (y), holding four points.

        Returns:
            np.ndarray: float32 array [tl, tr, br, bl].
        """
        # sort the points based on their x-coordinates
        xSorted = pts[np.argsort(pts[:, 0]), :]

        # grab the left-most and right-most points from the sorted
        # x-coordinate points
        leftMost = xSorted[:2, :]
        rightMost = xSorted[2:, :]

        # now, sort the left-most coordinates according to their
        # y-coordinates so we can grab the top-left and bottom-left
        # points, respectively
        leftMost = leftMost[np.argsort(leftMost[:, 1]), :]
        (tl, bl) = leftMost

        # same for the right-most pair: top-right, then bottom-right
        rightMost = rightMost[np.argsort(rightMost[:, 1]), :]
        (tr, br) = rightMost

        rect = np.array([tl, tr, br, bl], dtype="float32")
        return rect

    def expand_det_res(self, points, bbox_height, bbox_width, img_height,
                       img_width):
        """
        Enlarge a box ordered [tl, tr, br, bl] in place and return it.

        The margin is 20% of the width for tall boxes (height/width >= 2),
        20% of the height for wide boxes (width/height >= 3) and 10% of the
        height otherwise. Results are truncated to int and clipped to
        [0, img_width] / [0, img_height].
        """
        if bbox_height * 1.0 / bbox_width >= 2.0:
            expand_w = bbox_width * 0.20
            expand_h = bbox_width * 0.20
        elif bbox_width * 1.0 / bbox_height >= 3.0:
            expand_w = bbox_height * 0.20
            expand_h = bbox_height * 0.20
        else:
            expand_w = bbox_height * 0.1
            expand_h = bbox_height * 0.1

        # left edge (tl, bl) moves left, right edge (tr, br) moves right
        points[0, 0] = int(max((points[0, 0] - expand_w), 0))
        points[1, 0] = int(min((points[1, 0] + expand_w), img_width))
        points[3, 0] = int(max((points[3, 0] - expand_w), 0))
        points[2, 0] = int(min((points[2, 0] + expand_w), img_width))

        # top edge (tl, tr) moves up, bottom edge (bl, br) moves down
        points[0, 1] = int(max((points[0, 1] - expand_h), 0))
        points[1, 1] = int(max((points[1, 1] - expand_h), 0))
        points[3, 1] = int(min((points[3, 1] + expand_h), img_height))
        points[2, 1] = int(min((points[2, 1] + expand_h), img_height))
        return points

    def filter_tag_det_res(self, dt_boxes, image_shape):
        """
        Reorder, filter and (for near axis-aligned boxes) expand detections.

        Boxes whose top edge or left edge is 10 pixels or shorter are
        dropped. Boxes whose top edge deviates at most 10 pixels vertically
        and whose left edge deviates at most 10 pixels horizontally are
        expanded with ``expand_det_res``.

        Returns:
            np.ndarray built from the kept boxes.
        """
        img_height, img_width = image_shape[0:2]
        dt_boxes_new = []
        for box in dt_boxes:
            box = self.order_points_clockwise(box)
            left = int(np.min(box[:, 0]))
            right = int(np.max(box[:, 0]))
            top = int(np.min(box[:, 1]))
            bottom = int(np.max(box[:, 1]))
            bbox_height = bottom - top
            bbox_width = right - left
            diffh = math.fabs(box[0, 1] - box[1, 1])
            diffw = math.fabs(box[0, 0] - box[3, 0])
            rect_width = int(np.linalg.norm(box[0] - box[1]))
            rect_height = int(np.linalg.norm(box[0] - box[3]))
            if rect_width <= 10 or rect_height <= 10:
                continue
            if diffh <= 10 and diffw <= 10:
                box = self.expand_det_res(
                    copy.deepcopy(box), bbox_height, bbox_width, img_height,
                    img_width)
            dt_boxes_new.append(box)
        dt_boxes = np.array(dt_boxes_new)
        return dt_boxes

    def __call__(self, img):
        """
        Detect text boxes in ``img``.

        Returns:
            tuple: (boxes, elapsed seconds), or (None, 0) when the
            pre-processing step returns no image.
        """
        # Only the original shape is needed later, so keep the shape instead
        # of copying the whole image.
        ori_shape = img.shape
        im, ratio_list = self.preprocess_op(img)
        if im is None:
            return None, 0
        im = im.copy()
        starttime = time.time()
        self.input_tensor.copy_from_cpu(im)
        self.predictor.zero_copy_run()
        outputs = [
            output_tensor.copy_to_cpu()
            for output_tensor in self.output_tensors
        ]
        outs_dict = {}
        if self.det_algorithm == "EAST":
            outs_dict['f_score'] = outputs[0]
            outs_dict['f_geo'] = outputs[1]
        else:
            outs_dict['maps'] = [outputs[0]]
        dt_boxes_list = self.postprocess_op(outs_dict, [ratio_list])
        dt_boxes = dt_boxes_list[0]
        dt_boxes = self.filter_tag_det_res(dt_boxes, ori_shape)
        elapse = time.time() - starttime
        return dt_boxes, elapse
if __name__ == "__main__":
    # Run the detector on every image of args.image_dir, draw the result and
    # report the timing.
    args = utility.parse_args()
    image_file_list = utility.get_image_file_list(args.image_dir)
    text_detector = TextDetector(args)
    count = 0
    total_time = 0
    for image_file in image_file_list:
        img = cv2.imread(image_file)
        if img is None:
            logger.info("error in loading image:{}".format(image_file))
            continue
        dt_boxes, elapse = text_detector(img)
        # The first processed image is left out of the average.
        if count > 0:
            total_time += elapse
        count += 1
        print("Predict time of %s:" % image_file, elapse)
        utility.draw_text_det_res(dt_boxes, image_file)
    # With fewer than two processed images there is nothing to average; the
    # division used to fail (one image) or print a meaningless value (none).
    if count > 1:
        print("Avg Time:", total_time / (count - 1))
    else:
        print("Avg Time: not available (need at least 2 processed images)")

76
tools/infer/predict_eval.py Executable file
View File

@ -0,0 +1,76 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utility
from ppocr.utils.utility import initial_logger
logger = initial_logger()
import cv2
import predict_system
import copy
import numpy as np
import math
import time
import json
if __name__ == "__main__":
    # Script entry point: run detection + recognition on the images listed
    # in the label file and dump one JSON line per image to save_path.
    args = utility.parse_args()
    text_sys = predict_system.TextSystem(args)
    label_file_path = "./eval_perform/gt_res/test_chinese_ic15_500_4pts.txt"
    img_set_path = "./eval_perform/"
    image_file_list = []
    with open(label_file_path, "rb") as fin:
        for line in fin:
            # Each label line is "<image name>\t<annotation>"; only the
            # image name is needed here.
            substr = line.decode('utf-8').strip("\n").split("\t")
            if "lsvt" in substr[0]:
                continue
            image_file_list.append(substr[0])
    total_time_all = 0
    count = 0
    save_path = "./output/predict.txt"
    # The context manager closes the result file even if prediction raises.
    with open(save_path, "wb") as fout:
        for image_name in image_file_list:
            image_file = img_set_path + image_name
            img = cv2.imread(image_file)
            if img is None:
                logger.info("error in loading image:{}".format(image_file))
                continue
            starttime = time.time()
            dt_boxes, rec_res = text_sys(img)
            elapse = time.time() - starttime
            if dt_boxes is None:
                # TextSystem returns (None, None) when detection yields
                # nothing usable; skip instead of crashing on len(None).
                logger.info(
                    "no detection result for image:{}".format(image_file))
                continue
            count += 1
            total_time_all += elapse
            print("Predict time of %s(%d): %.3fs" % (image_file, count, elapse))
            bbox_list = []
            for box, (text, score) in zip(dt_boxes, rec_res):
                # Convert numpy scalars to plain floats so json.dumps can
                # always serialise them.
                points = [[float(pt[0]), float(pt[1])] for pt in box]
                bbox_list.append({
                    "transcription": text,
                    "points": points,
                    "scores": float(score)
                })
            otstr = image_name + "\t" + json.dumps(bbox_list) + "\n"
            fout.write(otstr.encode('utf-8'))
    if count > 0:
        avg_time = total_time_all / count
        logger.info("avg_time: {0}".format(avg_time))
        logger.info("avg_fps: {0}".format(1.0 / avg_time))
    else:
        # Avoid ZeroDivisionError when no image could be processed.
        logger.info("no image was processed, skip timing statistics")

72
tools/infer/predict_eval_new.py Executable file
View File

@ -0,0 +1,72 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utility
from ppocr.utils.utility import initial_logger
logger = initial_logger()
import cv2
import predict_system
import copy
import numpy as np
import math
import time
import json
import os
if __name__ == "__main__":
    # Script entry point: run detection + recognition on every file in
    # img_set_path and dump one JSON line per image to save_path.
    args = utility.parse_args()
    text_sys = predict_system.TextSystem(args)
    # NOTE(review): the image directory is hard-coded; --image_dir is ignored.
    img_set_path = "/paddle/code/dyn/test_imgs/rctw_samples/"
    image_file_list = os.listdir(img_set_path)
    total_time_all = 0
    count = 0
    save_path = "./output/predict.txt"
    # The context manager closes the result file even if prediction raises.
    with open(save_path, "wb") as fout:
        for image_name in image_file_list:
            image_file = img_set_path + image_name
            img = cv2.imread(image_file)
            if img is None:
                logger.info("error in loading image:{}".format(image_file))
                continue
            starttime = time.time()
            dt_boxes, rec_res = text_sys(img)
            if dt_boxes is None:
                # Images without a detection result are neither counted nor
                # timed.
                continue
            elapse = time.time() - starttime
            count += 1
            total_time_all += elapse
            print("Predict time of %s(%d): %.3fs" % (image_file, count, elapse))
            bbox_list = []
            for box, (text, score) in zip(dt_boxes, rec_res):
                # Convert numpy scalars to plain floats so json.dumps can
                # always serialise them.
                points = [[float(pt[0]), float(pt[1])] for pt in box]
                bbox_list.append({
                    "transcription": text,
                    "points": points,
                    "scores": float(score)
                })
            otstr = image_name + "\t" + json.dumps(bbox_list) + "\n"
            fout.write(otstr.encode('utf-8'))
    if count > 0:
        avg_time = total_time_all / count
        logger.info("avg_time: {0}".format(avg_time))
        logger.info("avg_fps: {0}".format(1.0 / avg_time))
    else:
        # Avoid ZeroDivisionError when no image could be processed.
        logger.info("no image was processed, skip timing statistics")

115
tools/infer/predict_rec.py Executable file
View File

@ -0,0 +1,115 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utility
from ppocr.utils.utility import initial_logger
logger = initial_logger()
import cv2
import copy
import numpy as np
import math
import time
from ppocr.utils.character import CharacterOps
class TextRecognizer(object):
    """Text-line recognizer driven by a Paddle zero-copy predictor.

    The predictor is created with ``utility.create_predictor(args, mode="rec")``
    and the decoded indices are turned into text with ``CharacterOps``
    configured for CTC decoding.
    """

    def __init__(self, args):
        """Build the predictor and the character decoder.

        Args:
            args: parsed options; reads ``rec_image_shape`` (a "C,H,W"
                string), ``rec_char_type`` and ``rec_char_dict_path``, plus
                whatever ``utility.create_predictor`` needs.
        """
        self.predictor, self.input_tensor, self.output_tensors = \
            utility.create_predictor(args, mode="rec")
        self.rec_image_shape = [
            int(v) for v in args.rec_image_shape.split(",")
        ]
        char_ops_params = {
            "character_type": args.rec_char_type,
            "character_dict_path": args.rec_char_dict_path,
            "loss_type": 'ctc',
        }
        self.char_ops = CharacterOps(char_ops_params)

    def resize_norm_img(self, img):
        """Resize, normalise and right-pad one cropped text image.

        The image is resized to the configured height while keeping its
        aspect ratio (the width is capped at the configured width), scaled
        from [0, 255] to [-1, 1], converted from HWC to CHW and zero-padded
        on the right.

        Args:
            img: HWC image array.

        Returns:
            float32 array of shape ``rec_image_shape`` (C, H, W).
        """
        imgC, imgH, imgW = self.rec_image_shape
        h, w = img.shape[0], img.shape[1]
        ratio = w / float(h)
        # Keep the aspect ratio but never exceed the model input width.
        resized_w = min(imgW, int(math.ceil(imgH * ratio)))
        resized_image = cv2.resize(img, (resized_w, imgH))
        resized_image = resized_image.astype('float32')
        resized_image = resized_image.transpose((2, 0, 1)) / 255
        resized_image -= 0.5
        resized_image /= 0.5
        padding_im = np.zeros((imgC, imgH, imgW), dtype=np.float32)
        padding_im[:, :, 0:resized_w] = resized_image
        return padding_im

    def __call__(self, img_list):
        """Recognise the text in every image of ``img_list``.

        Images are processed in batches of 15.

        Args:
            img_list: list of cropped text images (HWC arrays).

        Returns:
            (rec_res, predict_time): ``rec_res`` holds one ``[text, score]``
            entry per input image, in input order; ``predict_time`` is the
            accumulated time spent feeding, running and fetching from the
            predictor, in seconds.
        """
        img_num = len(img_list)
        batch_num = 15
        rec_res = []
        predict_time = 0
        for beg_img_no in range(0, img_num, batch_num):
            end_img_no = min(img_num, beg_img_no + batch_num)
            norm_img_batch = []
            for ino in range(beg_img_no, end_img_no):
                norm_img = self.resize_norm_img(img_list[ino])
                norm_img_batch.append(norm_img[np.newaxis, :])
            norm_img_batch = np.concatenate(norm_img_batch)
            norm_img_batch = norm_img_batch.copy()

            starttime = time.time()
            self.input_tensor.copy_from_cpu(norm_img_batch)
            self.predictor.zero_copy_run()
            # Output 0: decoded character indices; output 1: per-step class
            # probabilities. Both are split per sample by their LoD offsets.
            rec_idx_batch = self.output_tensors[0].copy_to_cpu()
            rec_idx_lod = self.output_tensors[0].lod()[0]
            predict_batch = self.output_tensors[1].copy_to_cpu()
            predict_lod = self.output_tensors[1].lod()[0]
            predict_time += time.time() - starttime

            for rno in range(len(rec_idx_lod) - 1):
                beg = rec_idx_lod[rno]
                end = rec_idx_lod[rno + 1]
                rec_idx_tmp = rec_idx_batch[beg:end, 0]
                preds_text = self.char_ops.decode(rec_idx_tmp)
                beg = predict_lod[rno]
                end = predict_lod[rno + 1]
                probs = predict_batch[beg:end, :]
                ind = np.argmax(probs, axis=1)
                # The last class column is treated as the CTC blank.
                blank = probs.shape[1]
                valid_ind = np.where(ind != (blank - 1))[0]
                if len(valid_ind) > 0:
                    score = np.mean(probs[valid_ind, ind[valid_ind]])
                else:
                    # Every step predicted blank: np.mean over an empty
                    # selection would return NaN (with a RuntimeWarning),
                    # so report a zero confidence instead.
                    score = 0.0
                rec_res.append([preds_text, score])
        return rec_res, predict_time
if __name__ == "__main__":
    # Script entry point: recognise every readable image under --image_dir
    # and print the predictions together with the predictor time.
    args = utility.parse_args()
    text_recognizer = TextRecognizer(args)
    valid_image_file_list = []
    img_list = []
    for image_file in utility.get_image_file_list(args.image_dir):
        img = cv2.imread(image_file)
        if img is None:
            logger.info("error in loading image:{}".format(image_file))
            continue
        valid_image_file_list.append(image_file)
        img_list.append(img)
    # The recognizer is deliberately invoked twice; only the second result
    # and timing are reported (presumably the first call is a warm-up --
    # TODO confirm).
    rec_res, predict_time = text_recognizer(img_list)
    rec_res, predict_time = text_recognizer(img_list)
    for ino, image_path in enumerate(valid_image_file_list):
        print("Predicts of %s:%s" % (image_path, rec_res[ino]))
    image_total = len(img_list)
    print("Total predict time for %d images:%.3f" %
          (image_total, predict_time))

97
tools/infer/predict_system.py Executable file
View File

@ -0,0 +1,97 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import utility
from ppocr.utils.utility import initial_logger
logger = initial_logger()
import cv2
import predict_det
import predict_rec
import copy
import numpy as np
import math
import time
class TextSystem(object):
    """Detection + recognition pipeline: detect boxes, crop them, read them."""

    def __init__(self, args):
        """Create the detector and the recognizer from the parsed options."""
        self.text_detector = predict_det.TextDetector(args)
        self.text_recognizer = predict_rec.TextRecognizer(args)

    def get_rotate_crop_image(self, img, points):
        """Cut the quadrilateral ``points`` out of ``img`` and rectify it.

        ``points`` is modified in place (shifted into the coordinates of the
        bounding-box crop). Crops at least 1.5 times taller than wide are
        rotated by 90 degrees with ``np.rot90``.

        Args:
            img: source image (H, W, C array).
            points: 4x2 array of corner coordinates.

        Returns:
            The perspective-corrected crop.
        """
        img_height, img_width = img.shape[0:2]
        x_min = int(np.min(points[:, 0]))
        x_max = int(np.max(points[:, 0]))
        y_min = int(np.min(points[:, 1]))
        y_max = int(np.max(points[:, 1]))
        img_crop = img[y_min:y_max, x_min:x_max, :].copy()
        # Express the corners relative to the cropped region.
        points[:, 0] = points[:, 0] - x_min
        points[:, 1] = points[:, 1] - y_min
        # Target size: length of the top edge by length of the left edge.
        crop_w = int(np.linalg.norm(points[0] - points[1]))
        crop_h = int(np.linalg.norm(points[0] - points[3]))
        pts_std = np.float32([[0, 0], [crop_w, 0], [crop_w, crop_h],
                              [0, crop_h]])
        M = cv2.getPerspectiveTransform(points, pts_std)
        dst_img = cv2.warpPerspective(
            img_crop,
            M, (crop_w, crop_h),
            borderMode=cv2.BORDER_REPLICATE)
        out_h, out_w = dst_img.shape[0:2]
        if out_h * 1.0 / out_w >= 1.5:
            dst_img = np.rot90(dst_img)
        return dst_img

    def print_draw_crop_rec_res(self, img_crop_list, rec_res):
        """Debug helper: write every crop to ./output and print its result."""
        for bno, crop in enumerate(img_crop_list):
            cv2.imwrite("./output/img_crop_%d.jpg" % bno, crop)
            print(bno, rec_res[bno])

    def __call__(self, img):
        """Run detection and recognition on one image.

        Returns:
            (dt_boxes, rec_res), or (None, None) when the detector returns
            no boxes object.
        """
        ori_im = img.copy()
        dt_boxes, elapse = self.text_detector(img)
        if dt_boxes is None:
            return None, None
        # Work on a copy of each box: cropping shifts the points in place.
        img_crop_list = [
            self.get_rotate_crop_image(ori_im, copy.deepcopy(box))
            for box in dt_boxes
        ]
        rec_res, elapse = self.text_recognizer(img_crop_list)
        # self.print_draw_crop_rec_res(img_crop_list, rec_res)
        return dt_boxes, rec_res
if __name__ == "__main__":
    # Script entry point: run the full detection + recognition pipeline on
    # every image under --image_dir, print the recognised text and draw the
    # boxes that were kept.
    args = utility.parse_args()
    image_file_list = utility.get_image_file_list(args.image_dir)
    text_sys = TextSystem(args)
    for image_file in image_file_list:
        img = cv2.imread(image_file)
        if img is None:
            logger.info("error in loading image:{}".format(image_file))
            continue
        starttime = time.time()
        dt_boxes, rec_res = text_sys(img)
        elapse = time.time() - starttime
        print("Predict time of %s: %.3fs" % (image_file, elapse))
        if dt_boxes is None:
            # TextSystem returns (None, None) when detection produced no
            # boxes object; len(None) below would raise TypeError.
            logger.info("no detection result for image:{}".format(image_file))
            continue
        dt_boxes_final = []
        for dno in range(len(dt_boxes)):
            text, score = rec_res[dno]
            # Only results with a non-negative score are printed and drawn.
            if score >= 0:
                text_str = "%s, %.3f" % (text, score)
                print(text_str)
                dt_boxes_final.append(dt_boxes[dno])
        utility.draw_text_det_res(dt_boxes_final, image_file)

147
tools/infer/utility.py Executable file
View File

@ -0,0 +1,147 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
import argparse
import os, sys
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from paddle.fluid.core import PaddleTensor
from paddle.fluid.core import AnalysisConfig
from paddle.fluid.core import create_paddle_predictor
import cv2
import numpy as np
def parse_args():
    """Parse the command-line options shared by the inference scripts.

    Returns:
        argparse.Namespace with the prediction-engine, detector and
        recognizer options.
    """

    def str2bool(v):
        # Only "true", "t" and "1" (case-insensitive) count as True.
        return v.lower() in ("true", "t", "1")

    option_table = [
        # params for prediction engine
        ("--use_gpu", dict(type=str2bool, default=True)),
        ("--ir_optim", dict(type=str2bool, default=True)),
        ("--use_tensorrt", dict(type=str2bool, default=False)),
        ("--gpu_mem", dict(type=int, default=8000)),
        # params for text detector
        ("--image_dir", dict(type=str)),
        ("--det_algorithm", dict(type=str, default='DB')),
        ("--det_model_dir", dict(type=str)),
        ("--det_max_side_len", dict(type=float, default=960)),
        # DB params
        ("--det_db_thresh", dict(type=float, default=0.3)),
        ("--det_db_box_thresh", dict(type=float, default=0.5)),
        # EAST params
        ("--det_east_score_thresh", dict(type=float, default=0.8)),
        ("--det_east_cover_thresh", dict(type=float, default=0.1)),
        ("--det_east_nms_thresh", dict(type=float, default=0.2)),
        # params for text recognizer
        ("--rec_algorithm", dict(type=str, default='CRNN')),
        ("--rec_model_dir", dict(type=str)),
        ("--rec_image_shape", dict(type=str, default="3, 32, 320")),
        ("--rec_char_type", dict(type=str, default='ch')),
        ("--rec_char_dict_path",
         dict(type=str, default="./ppocr/utils/ppocr_keys_v1.txt")),
    ]
    parser = argparse.ArgumentParser()
    for flag, options in option_table:
        parser.add_argument(flag, **options)
    return parser.parse_args()
def get_image_file_list(image_dir):
    """Collect the file paths designated by ``image_dir``.

    Args:
        image_dir: path of a single file, path of a directory, or None.

    Returns:
        ``[image_dir]`` for a file, the joined path of every directory entry
        for a directory (no filtering by extension), and an empty list for
        None or a path that is neither.
    """
    if image_dir is None:
        return []
    if os.path.isfile(image_dir):
        return [image_dir]
    if os.path.isdir(image_dir):
        return [
            os.path.join(image_dir, entry) for entry in os.listdir(image_dir)
        ]
    return []
def create_predictor(args, mode):
    """Create a Paddle zero-copy predictor for the detector or recognizer.

    Args:
        args: parsed options; reads ``det_model_dir`` / ``rec_model_dir``,
            ``use_gpu``, ``gpu_mem`` and ``ir_optim``.
        mode: "det" selects the detection model; any other value selects the
            recognition model.

    Returns:
        (predictor, input_tensor, output_tensors): the predictor, the tensor
        of its first input, and the list of all its output tensors.

    Exits the process with status 1 when the model directory is not set or
    the ``model`` / ``params`` files are missing.
    """
    model_dir = args.det_model_dir if mode == "det" else args.rec_model_dir

    if model_dir is None:
        logger.error("not find {} model file path {}".format(mode, model_dir))
        # A missing model is a failure: exit with a non-zero status so that
        # calling scripts do not mistake it for success.
        sys.exit(1)
    model_file_path = os.path.join(model_dir, "model")
    params_file_path = os.path.join(model_dir, "params")
    if not os.path.exists(model_file_path):
        logger.error("not find model file path {}".format(model_file_path))
        sys.exit(1)
    if not os.path.exists(params_file_path):
        logger.error("not find params file path {}".format(params_file_path))
        sys.exit(1)

    config = AnalysisConfig(model_file_path, params_file_path)

    if args.use_gpu:
        config.enable_use_gpu(args.gpu_mem, 0)
    else:
        config.disable_gpu()

    config.disable_glog_info()
    config.switch_ir_optim(args.ir_optim)
    # if args.use_tensorrt:
    #     config.enable_tensorrt_engine(
    #         precision_mode=AnalysisConfig.Precision.Half
    #         if args.use_fp16 else AnalysisConfig.Precision.Float32,
    #         max_batch_size=args.batch_size)

    config.enable_memory_optim()
    # use zero copy
    config.switch_use_feed_fetch_ops(False)
    predictor = create_paddle_predictor(config)
    input_names = predictor.get_input_names()
    input_tensor = predictor.get_input_tensor(input_names[0])
    output_tensors = [
        predictor.get_output_tensor(output_name)
        for output_name in predictor.get_output_names()
    ]
    return predictor, input_tensor, output_tensors
def draw_text_det_res(dt_boxes, img_path):
    """Draw detected boxes on an image and save it under ``./output``.

    Args:
        dt_boxes: iterable of boxes; each box is reshaped to an (N, 2)
            int32 polygon before drawing.
        img_path: path of the source image; the result is written to
            ``./output/<file name of img_path>``.
    """
    src_im = cv2.imread(img_path)
    if src_im is None:
        # cv2.imread returns None for unreadable files; drawing on None
        # would raise inside OpenCV.
        logger.info("error in loading image:{}".format(img_path))
        return
    for box in dt_boxes:
        box = np.array(box).astype(np.int32).reshape(-1, 2)
        cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
    # basename also handles platform-specific path separators.
    img_name_pure = os.path.basename(img_path)
    output_dir = "./output"
    # cv2.imwrite does not create missing directories, so make sure the
    # target directory exists before writing.
    if not os.path.exists(output_dir):
        os.makedirs(output_dir)
    cv2.imwrite(os.path.join(output_dir, img_name_pure), src_im)
if __name__ == '__main__':
    # Debug entry point: build the detection and recognition predictors on
    # CPU and print their input/output names.
    args = parse_args()
    args.use_gpu = False
    # NOTE(review): developer-specific absolute path; adjust before running.
    root_path = "/Users/liuweiwei06/Desktop/TEST_CODES/icode/baidu/personal-code/PaddleOCR/"
    args.det_model_dir = root_path + "test_models/public_v1/ch_det_mv3_db"
    predictor, input_tensor, output_tensors = create_predictor(args, mode='det')
    print(predictor.get_input_names())
    print(predictor.get_output_names())
    # Use a context manager so the dump file is flushed and closed.
    with open("det_program.txt", 'w') as program_file:
        print(predictor.program(), file=program_file)

    args.rec_model_dir = root_path + "test_models/public_v1/ch_rec_mv3_crnn/"
    rec_predictor, input_tensor, output_tensors = create_predictor(
        args, mode='rec')
    print(rec_predictor.get_input_names())
    print(rec_predictor.get_output_names())

125
tools/infer_rec.py Executable file
View File

@ -0,0 +1,125 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import multiprocessing
import numpy as np
def set_paddle_flags(**kwargs):
    """Export each keyword as an environment variable unless already set.

    Values are converted with ``str``; variables that already exist in the
    environment are left untouched.
    """
    for flag_name in kwargs:
        if flag_name not in os.environ:
            os.environ[flag_name] = str(kwargs[flag_name])
# NOTE(paddle-dev): All of these flags must be
# set before `import paddle`. Otherwise, they
# will not take effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
from paddle import fluid
# from ppocr.utils.utility import load_config, merge_config
from ppocr.data.reader_main import test_reader
import program
from paddle import fluid
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.data.reader_main import reader_main
from ppocr.utils.save_load import init_model
from ppocr.utils.character import CharacterOps
from ppocr.utils.utility import create_module
logger = initial_logger()
def main():
    """Run the recognition model on the test reader and export it.

    Loads the config named by ``FLAGS.config`` (merged with ``FLAGS.opt``),
    builds the test program, initialises the weights, prints the raw and
    decoded prediction of every image in the first reader batch, then saves
    an inference model to ``./output/``.
    """
    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)
    char_ops = CharacterOps(config['Global'])
    config['Global']['char_ops'] = char_ops

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    # check_gpu(use_gpu)

    if use_gpu:
        place = fluid.CUDAPlace(0)
    else:
        place = fluid.CPUPlace()
    exe = fluid.Executor(place)

    model_builder = create_module(config['Architecture']['function'])
    rec_model = model_builder(params=config)

    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            _, outputs = rec_model(mode="test")
            fetch_name_list = list(outputs.keys())
            fetch_varname_list = [
                outputs[name].name for name in fetch_name_list
            ]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)

    init_model(config, eval_prog, exe)

    blobs = reader_main(config, 'test')
    # Only the first batch produced by the reader is evaluated.
    imgs = next(blobs())
    for img in imgs:
        predict = exe.run(program=eval_prog,
                          feed={"image": img},
                          fetch_list=fetch_varname_list,
                          return_numpy=False)

        preds = np.array(predict[0])
        if preds.shape[1] == 1:
            # Single-column output: flatten and decode the indices directly.
            preds = preds.reshape(-1)
            preds_lod = predict[0].lod()[0]
            preds_text = char_ops.decode(preds)
        else:
            # Multi-column output: drop the first token and cut at the
            # second occurrence of index 1, when there is one.
            end_pos = np.where(preds[0, :] == 1)[0]
            cut = end_pos[1] if len(end_pos) > 1 else None
            preds_text = preds[0, 1:cut]
            preds_text = preds_text.reshape(-1)
            preds_text = char_ops.decode(preds_text)

        print(preds)
        print(preds_text)

    # save for inference model
    target_var = list(outputs.values())

    fluid.io.save_inference_model(
        "./output/",
        feeded_var_names=['image'],
        target_vars=target_var,
        executor=exe,
        main_program=eval_prog,
        model_filename="model",
        params_filename="params")


if __name__ == '__main__':
    # FLAGS is read as a module-level global by main().
    FLAGS = program.ArgsParser().parse_args()
    main()

365
tools/program.py Executable file
View File

@ -0,0 +1,365 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
from argparse import ArgumentParser, RawDescriptionHelpFormatter
import sys
import yaml
import os
from ppocr.utils.utility import create_module
from ppocr.utils.utility import initial_logger
logger = initial_logger()
import paddle.fluid as fluid
import time
from ppocr.utils.stats import TrainingStats
from eval_utils.eval_det_utils import eval_det_run
from eval_utils.eval_rec_utils import eval_rec_run
from ppocr.utils.save_load import save_model
import numpy as np
from ppocr.utils.character import cal_predicts_accuracy
class ArgsParser(ArgumentParser):
    """Command-line parser for the training/evaluation tools.

    Accepts ``-c/--config`` (required) and ``-o/--opt key=value ...``
    overrides; the overrides are returned as a dict in ``args.opt``.
    """

    def __init__(self):
        super(ArgsParser, self).__init__(
            formatter_class=RawDescriptionHelpFormatter)
        self.add_argument("-c", "--config", help="configuration file to use")
        self.add_argument(
            "-o", "--opt", nargs='+', help="set configuration options")

    def parse_args(self, argv=None):
        """Parse ``argv`` and convert the ``--opt`` list into a dict.

        Raises:
            AssertionError: when ``--config`` is not given.
            ValueError: when an ``--opt`` entry is not of the form key=value.
        """
        args = super(ArgsParser, self).parse_args(argv)
        assert args.config is not None, \
            "Please specify --config=configure_file_path."
        args.opt = self._parse_opt(args.opt)
        return args

    def _parse_opt(self, opts):
        """Turn ``["a.b=1", ...]`` into ``{"a.b": 1, ...}``.

        Each value is parsed as YAML. Returns an empty dict when ``opts`` is
        None or empty.
        """
        config = {}
        if not opts:
            return config
        for s in opts:
            s = s.strip()
            if '=' not in s:
                raise ValueError(
                    "invalid option '{}', expected key=value".format(s))
            # Split on the first '=' only, so values may contain '='
            # themselves (e.g. URLs with query strings).
            k, v = s.split('=', 1)
            # NOTE(review): yaml.Loader can construct arbitrary Python
            # objects; acceptable for trusted command lines only.
            config[k] = yaml.load(v, Loader=yaml.Loader)
        return config
class AttrDict(dict):
    """Dict whose keys can also be read as attributes (one level only)."""

    def __init__(self, **kwargs):
        # Start empty, then copy in the keyword arguments.
        super(AttrDict, self).__init__()
        for name, value in kwargs.items():
            super(AttrDict, self).__setitem__(name, value)

    def __getattr__(self, key):
        # Only called when normal attribute lookup fails.
        if key not in self:
            raise AttributeError("object has no attribute '{}'".format(key))
        return self[key]


# Process-wide configuration shared by load_config / merge_config.
global_config = AttrDict()
def load_config(file_path):
    """
    Load config from yml/yaml file.

    The main file is merged into the global config first; the reader config
    named by its ``Global.reader_yml`` entry is merged afterwards.

    Args:
        file_path (str): Path of the config file to be loaded.

    Returns: global config
    """
    _, ext = os.path.splitext(file_path)
    assert ext in ['.yml', '.yaml'], "only support yaml files for now"
    # NOTE(review): yaml.Loader can construct arbitrary Python objects;
    # only load configuration files from trusted sources.
    # Context managers make sure the file handles are closed.
    with open(file_path) as config_file:
        merge_config(yaml.load(config_file, Loader=yaml.Loader))
    assert "reader_yml" in global_config['Global'],\
        "absence reader_yml in global"
    reader_file_path = global_config['Global']['reader_yml']
    _, ext = os.path.splitext(reader_file_path)
    assert ext in ['.yml', '.yaml'], "only support yaml files for reader"
    with open(reader_file_path) as reader_file:
        merge_config(yaml.load(reader_file, Loader=yaml.Loader))
    return global_config
def merge_config(config):
    """
    Merge config into global config.

    Keys without a dot are merged at the top level: a dict value updates an
    existing top-level entry, anything else replaces it. Dotted keys such
    as ``"Global.use_gpu"`` overwrite an existing nested entry; every
    component of the path must already exist.

    Args:
        config (dict): Config to be merged.

    Returns: global config
    """
    for key, value in config.items():
        if "." not in key:
            if isinstance(value, dict) and key in global_config:
                global_config[key].update(value)
            else:
                global_config[key] = value
        else:
            sub_keys = key.split('.')
            assert (sub_keys[0] in global_config)
            cur = global_config[sub_keys[0]]
            for idx, sub_key in enumerate(sub_keys[1:]):
                assert (sub_key in cur)
                # The last path component receives the value; the others
                # are only used to descend.
                if idx == len(sub_keys) - 2:
                    cur[sub_key] = value
                else:
                    cur = cur[sub_key]
    # Return the merged config as the docstring has always promised.
    return global_config
def check_gpu(use_gpu):
    """
    Log an error and exit with status 1 when use_gpu is requested but the
    installed paddlepaddle was built without CUDA.

    Any exception raised while performing the check is ignored.
    """
    err = ("Config use_gpu cannot be set as true while you are "
           "using paddlepaddle cpu version ! \nPlease try: \n"
           "\t1. Install paddlepaddle-gpu to run model on GPU \n"
           "\t2. Set use_gpu as false in config file to run "
           "model on CPU")

    if not use_gpu:
        return
    try:
        cuda_available = fluid.is_compiled_with_cuda()
        if not cuda_available:
            logger.error(err)
            sys.exit(1)
    except Exception as e:
        # Best effort: a failing check must not stop the program
        # (SystemExit is not an Exception and still propagates).
        pass
def build(config, main_prog, startup_prog, mode):
    """
    Build the model (and, for training, the optimizer) inside main_prog.

    Args:
        config(dict): config; reads config['Architecture']['function'] and,
            when mode is "train", config['Optimizer']
        main_prog(): main program
        startup_prog(): startup program
        mode(str): forwarded to the model; "train" additionally creates the
            optimizer and minimizes outputs['total_loss']
    Returns:
        dataloader(): first value returned by the model
        fetch_name_list(list): keys of the model outputs ("lr" is prepended
            when training)
        fetch_varname_list(list): variable names matching fetch_name_list
        opt_loss_name(str): name of the optimized loss, None unless training
    """
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            model_builder = create_module(config['Architecture']['function'])
            model = model_builder(params=config)
            dataloader, outputs = model(mode=mode)
            fetch_name_list = list(outputs.keys())
            fetch_varname_list = [
                outputs[name].name for name in fetch_name_list
            ]
            opt_loss_name = None
            if mode == "train":
                opt_loss = outputs['total_loss']
                opt_params = config['Optimizer']
                optimizer_builder = create_module(opt_params['function'])
                optimizer = optimizer_builder(opt_params)
                optimizer.minimize(opt_loss)
                opt_loss_name = opt_loss.name
                # Expose the learning rate as the first fetched value.
                global_lr = optimizer._global_learning_rate()
                global_lr.persistable = True
                fetch_name_list.insert(0, "lr")
                fetch_varname_list.insert(0, global_lr.name)
    return (dataloader, fetch_name_list, fetch_varname_list, opt_loss_name)
def build_export(config, main_prog, startup_prog):
    """
    Build the model in 'export' mode inside main_prog.

    Args:
        config(dict): config; reads config['Architecture']['function']
        main_prog(): main program
        startup_prog(): startup program
    Returns:
        feeded_var_names(list): name of the image variable returned by the
            model
        target_vars(list): the model output variables
        fetches_var_name(list): keys of the model outputs, in the same order
    """
    with fluid.program_guard(main_prog, startup_prog):
        with fluid.unique_name.guard():
            model_builder = create_module(config['Architecture']['function'])
            model = model_builder(params=config)
            image, outputs = model(mode='export')
            fetches_var_name = list(outputs)
            target_vars = [outputs[name] for name in fetches_var_name]
    feeded_var_names = [image.name]
    return feeded_var_names, target_vars, fetches_var_name
def create_multi_devices_program(program, loss_var_name):
    """Compile ``program`` for data-parallel execution.

    Args:
        program: the fluid program to compile.
        loss_var_name: name of the loss variable passed as ``loss_name``.

    Returns:
        The ``fluid.CompiledProgram`` configured with data parallelism.
    """
    build_cfg = fluid.BuildStrategy()
    build_cfg.memory_optimize = False
    build_cfg.enable_inplace = True

    exec_cfg = fluid.ExecutionStrategy()
    exec_cfg.num_iteration_per_drop_scope = 1

    compiled = fluid.CompiledProgram(program)
    return compiled.with_data_parallel(
        loss_name=loss_var_name,
        build_strategy=build_cfg,
        exec_strategy=exec_cfg)
def train_eval_det_run(config, exe, train_info_dict, eval_info_dict):
"""
Train the detection model, evaluating and checkpointing along the way.
Args:
config(dict): config; the schedule is read from config['Global']
exe(): executor used to run the training program
train_info_dict(dict): provides 'reader', 'compile_program',
'train_program', 'fetch_name_list' and 'fetch_varname_list'
eval_info_dict(dict): forwarded to eval_det_run
"""
# Global batch counter; it keeps increasing across epochs.
train_batch_id = 0
log_smooth_window = config['Global']['log_smooth_window']
epoch_num = config['Global']['epoch_num']
print_batch_step = config['Global']['print_batch_step']
eval_batch_step = config['Global']['eval_batch_step']
save_epoch_step = config['Global']['save_epoch_step']
save_model_dir = config['Global']['save_model_dir']
train_stats = TrainingStats(log_smooth_window,
train_info_dict['fetch_name_list'])
best_eval_hmean = -1
best_batch_id = 0
best_epoch = 0
train_loader = train_info_dict['reader']
for epoch in range(epoch_num):
train_loader.start()
try:
# Runs until the reader is exhausted (signalled by EOFException).
while True:
t1 = time.time()
train_outs = exe.run(
program=train_info_dict['compile_program'],
fetch_list=train_info_dict['fetch_varname_list'],
return_numpy=False)
# Reduce every fetched tensor to its mean, keyed by fetch name.
stats = {}
for tno in range(len(train_outs)):
fetch_name = train_info_dict['fetch_name_list'][tno]
fetch_value = np.mean(np.array(train_outs[tno]))
stats[fetch_name] = fetch_value
t2 = time.time()
train_batch_elapse = t2 - t1
train_stats.update(stats)
# Periodic training log (never on the very first batch).
if train_batch_id > 0 and train_batch_id \
% print_batch_step == 0:
logs = train_stats.log()
strs = 'epoch: {}, iter: {}, {}, time: {:.3f}'.format(
epoch, train_batch_id, logs, train_batch_elapse)
logger.info(strs)
# Periodic evaluation; the model with the best hmean so far
# (ties included) is saved as "best_accuracy".
if train_batch_id > 0 and\
train_batch_id % eval_batch_step == 0:
metrics = eval_det_run(exe, config, eval_info_dict, "eval")
hmean = metrics['hmean']
if hmean >= best_eval_hmean:
best_eval_hmean = hmean
best_batch_id = train_batch_id
best_epoch = epoch
save_path = save_model_dir + "/best_accuracy"
save_model(train_info_dict['train_program'], save_path)
strs = 'Test iter: {}, metrics:{}, best_hmean:{:.6f}, best_epoch:{}, best_batch_id:{}'.format(
train_batch_id, metrics, best_eval_hmean, best_epoch,
best_batch_id)
logger.info(strs)
train_batch_id += 1
except fluid.core.EOFException:
# End of the epoch: reset the reader so it can be started again.
train_loader.reset()
# Periodic checkpoint, skipped for epoch 0.
if epoch > 0 and epoch % save_epoch_step == 0:
save_path = save_model_dir + "/iter_epoch_%d" % (epoch)
save_model(train_info_dict['train_program'], save_path)
return
def train_eval_rec_run(config, exe, train_info_dict, eval_info_dict):
"""
Train the recognition model, evaluating and checkpointing along the way.
Args:
config(dict): config; the schedule and 'char_ops' are read from
config['Global']
exe(): executor used to run the training program
train_info_dict(dict): provides 'reader', 'compile_program',
'train_program', 'fetch_name_list' and 'fetch_varname_list'
eval_info_dict(dict): forwarded to eval_rec_run
"""
# Global batch counter; it keeps increasing across epochs.
train_batch_id = 0
log_smooth_window = config['Global']['log_smooth_window']
epoch_num = config['Global']['epoch_num']
print_batch_step = config['Global']['print_batch_step']
eval_batch_step = config['Global']['eval_batch_step']
save_epoch_step = config['Global']['save_epoch_step']
save_model_dir = config['Global']['save_model_dir']
train_stats = TrainingStats(log_smooth_window, ['loss', 'acc'])
best_eval_acc = -1
best_batch_id = 0
best_epoch = 0
train_loader = train_info_dict['reader']
for epoch in range(epoch_num):
train_loader.start()
try:
# Runs until the reader is exhausted (signalled by EOFException).
while True:
t1 = time.time()
train_outs = exe.run(
program=train_info_dict['compile_program'],
fetch_list=train_info_dict['fetch_varname_list'],
return_numpy=False)
# Map each fetch name to its position in train_outs.
fetch_map = dict(
zip(train_info_dict['fetch_name_list'],
range(len(train_outs))))
loss = np.mean(np.array(train_outs[fetch_map['total_loss']]))
lr = np.mean(np.array(train_outs[fetch_map['lr']]))
# Predictions and labels are fetched with their LoD offsets and
# compared by cal_predicts_accuracy to get the batch accuracy.
preds_idx = fetch_map['decoded_out']
preds = np.array(train_outs[preds_idx])
preds_lod = train_outs[preds_idx].lod()[0]
labels_idx = fetch_map['label']
labels = np.array(train_outs[labels_idx])
labels_lod = train_outs[labels_idx].lod()[0]
acc, acc_num, img_num = cal_predicts_accuracy(
config['Global']['char_ops'], preds, preds_lod, labels,
labels_lod)
t2 = time.time()
train_batch_elapse = t2 - t1
stats = {'loss': loss, 'acc': acc}
train_stats.update(stats)
# Periodic training log (never on the very first batch).
if train_batch_id > 0 and train_batch_id \
% print_batch_step == 0:
logs = train_stats.log()
strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
epoch, train_batch_id, lr, logs, train_batch_elapse)
logger.info(strs)
# Periodic evaluation; the model is saved as "best_accuracy" only
# when the accuracy strictly improves.
if train_batch_id > 0 and\
train_batch_id % eval_batch_step == 0:
metrics = eval_rec_run(exe, config, eval_info_dict, "eval")
eval_acc = metrics['avg_acc']
eval_sample_num = metrics['total_sample_num']
if eval_acc > best_eval_acc:
best_eval_acc = eval_acc
best_batch_id = train_batch_id
best_epoch = epoch
save_path = save_model_dir + "/best_accuracy"
save_model(train_info_dict['train_program'], save_path)
strs = 'Test iter: {}, acc:{:.6f}, best_acc:{:.6f}, best_epoch:{}, best_batch_id:{}, eval_sample_num:{}'.format(
train_batch_id, eval_acc, best_eval_acc, best_epoch,
best_batch_id, eval_sample_num)
logger.info(strs)
train_batch_id += 1
except fluid.core.EOFException:
# End of the epoch: reset the reader so it can be started again.
train_loader.reset()
# Periodic checkpoint, skipped for epoch 0.
if epoch > 0 and epoch % save_epoch_step == 0:
save_path = save_model_dir + "/iter_epoch_%d" % (epoch)
save_model(train_info_dict['train_program'], save_path)
return

134
tools/tmp/eval_det.py Executable file
View File

@ -0,0 +1,134 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import numpy as np
from copy import deepcopy
import json
# from paddle.fluid.contrib.model_stat import summary
def set_paddle_flags(**kwargs):
    """Set every given flag in ``os.environ`` unless it is already defined.

    Flag values are stored as strings; existing variables win.
    """
    missing = [name for name in kwargs if os.environ.get(name) is None]
    for name in missing:
        os.environ[name] = str(kwargs[name])
# NOTE(paddle-dev): All of these flags must be
# set before `import paddle`. Otherwise, they
# will not take effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import load_config, merge_config
import ppocr.data.det.reader_main as reader
from ppocr.utils.utility import ArgsParser
from ppocr.utils.check import check_gpu
from ppocr.utils.checkpoint import load_pretrain, load_checkpoint, save, save_model
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.eval_utils import eval_det_run
def draw_det_res(dt_boxes, config, img_name, ino):
    """Draw detected boxes on the source image and save it as ``tmp<ino>.jpg``.

    Args:
        dt_boxes: sequence of box arrays; each one is cast to int32 and
            reshaped to (-1, 1, 2) before being drawn as a closed polyline.
        config: configuration dict; ``config['TestReader']['img_set_dir']``
            is used as a plain string prefix for ``img_name``.
        img_name: image path relative to the image-set directory.
        ino: integer used to build the output file name.

    Nothing is written when ``dt_boxes`` is empty or when the image cannot
    be read.
    """
    if len(dt_boxes) == 0:
        return
    import cv2
    img_set_path = config['TestReader']['img_set_dir']
    # Plain concatenation: img_set_dir is expected to end with a separator.
    img_path = img_set_path + img_name
    src_im = cv2.imread(img_path)
    if src_im is None:
        # cv2.imread signals a missing/unreadable file by returning None;
        # drawing on it would fail inside OpenCV with an opaque error.
        logger.info("draw_det_res: can not read image:%s" % img_path)
        return
    for box in dt_boxes:
        box = box.astype(np.int32).reshape((-1, 1, 2))
        cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
    cv2.imwrite("tmp%d.jpg" % ino, src_im)
def main():
    """Build the detection network in test mode, load weights and log metrics.

    Reads the module-level ``FLAGS`` (``FLAGS.config`` and ``FLAGS.opt``) set
    by the ``__main__`` guard. Exits the process when
    ``config['Global']['pretrain_weights']`` is None.
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    print(config)
    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    # config['Architecture']['function'] names the model factory to create.
    det_model = create_module(config['Architecture']['function'])(params=config)
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    # Build the network inside eval_prog; the fetch list holds the output
    # variable names so they can be requested from the executor later.
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_loader, eval_outputs = det_model(mode="test")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)
    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        load_pretrain(exe, eval_prog, pretrain_weights)
        # fluid.load(eval_prog, pretrain_weights)
        # def if_exist(var):
        # return os.path.exists(os.path.join(pretrain_weights, var.name))
        # fluid.io.load_vars(exe, pretrain_weights, predicate=if_exist, main_program=eval_prog)
    else:
        # NOTE(review): exits with status 0 although nothing was evaluated;
        # confirm that callers do not rely on a non-zero code here.
        logger.info("Not find pretrain_weights:%s" % pretrain_weights)
        sys.exit(0)
    # fluid.io.save_inference_model("./output/", feeded_var_names=['image'],
    # target_vars=eval_outputs, executor=exe, main_program=eval_prog,
    # model_filename="model", params_filename="params")
    # sys.exit(-1)
    metrics = eval_det_run(exe, eval_prog, eval_fetch_list, config, "test")
    logger.info("metrics:{}".format(metrics))
    logger.info("success!")
def test_reader():
    """Iterate the detection test reader once and print throughput figures.

    Debug helper: loads the config named by the module-level ``FLAGS``,
    counts the samples yielded by ``reader.test_reader`` and, every tenth
    batch, prints the running sample count, the size of the current batch
    and the mean wall-clock time per batch so far.
    """
    import time

    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    print(config)
    batch_source = reader.test_reader(config=config)
    sample_total = 0
    began = time.time()
    for batch_no, batch in enumerate(batch_source(), start=1):
        sample_total += len(batch)
        if batch_no % 10 != 0:
            continue
        mean_batch_seconds = (time.time() - began) / batch_no
        print("reader:", sample_total, len(batch), mean_batch_seconds)
    print("finish reader:", sample_total)
    print("success")
if __name__ == '__main__':
    # Script entry point: parse the command line into the module-level FLAGS
    # that main() reads (FLAGS.config, FLAGS.opt), then run the evaluation.
    parser = ArgsParser()
    FLAGS = parser.parse_args()
    main()
    # test_reader()

160
tools/tmp/infer_det.py Executable file
View File

@ -0,0 +1,160 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import numpy as np
from copy import deepcopy
import json
# from paddle.fluid.contrib.model_stat import summary
def set_paddle_flags(**kwargs):
    """Export flags as environment variables without overriding existing ones.

    Each keyword argument names one environment variable. A variable that is
    already present in ``os.environ`` is left untouched; a missing one is set
    to ``str(value)``.
    """
    for flag_name in kwargs:
        if flag_name in os.environ:
            # Keep whatever the caller's environment already provides.
            continue
        os.environ[flag_name] = str(kwargs[flag_name])
# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import load_config, merge_config
import ppocr.data.det.reader_main as reader
from ppocr.utils.utility import ArgsParser
from ppocr.utils.check import check_gpu
from ppocr.utils.checkpoint import load_pretrain, load_checkpoint, save, save_model
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.eval_utils import eval_det_run
def draw_det_res(dt_boxes, config, img_name, ino):
    """Draw detected boxes on the source image and save it as ``tmp<ino>.jpg``.

    Args:
        dt_boxes: sequence of box arrays; each one is cast to int32 and
            reshaped to (-1, 1, 2) before being drawn as a closed polyline.
        config: configuration dict; ``config['TestReader']['img_set_dir']``
            is used as a plain string prefix for ``img_name``.
        img_name: image path relative to the image-set directory.
        ino: integer used to build the output file name.

    Nothing is written when ``dt_boxes`` is empty or when the image cannot
    be read.
    """
    if len(dt_boxes) == 0:
        return
    import cv2
    img_set_path = config['TestReader']['img_set_dir']
    # Plain concatenation: img_set_dir is expected to end with a separator.
    img_path = img_set_path + img_name
    src_im = cv2.imread(img_path)
    if src_im is None:
        # cv2.imread signals a missing/unreadable file by returning None;
        # drawing on it would fail inside OpenCV with an opaque error.
        logger.info("draw_det_res: can not read image:%s" % img_path)
        return
    for box in dt_boxes:
        box = box.astype(np.int32).reshape((-1, 1, 2))
        cv2.polylines(src_im, [box], True, color=(255, 255, 0), thickness=2)
    cv2.imwrite("tmp%d.jpg" % ino, src_im)
def main():
    """Run text detection over the test reader and dump the boxes to a file.

    Reads the module-level ``FLAGS`` (``FLAGS.config`` and ``FLAGS.opt``) set
    by the ``__main__`` guard. For every image one line is written to
    ``config['Global']['save_res_path']``: the image name, a tab, and a JSON
    list of ``{"transcription": "", "points": [...]}`` entries. Exits the
    process when ``config['Global']['pretrain_weights']`` is None.
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    print(config)

    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)

    det_model = create_module(config['Architecture']['function'])(params=config)

    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_outputs = det_model(mode="test")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)

    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        fluid.load(eval_prog, pretrain_weights)
    else:
        # NOTE(review): exits with status 0 although nothing was inferred;
        # confirm that callers do not rely on a non-zero code here.
        logger.info("Not find pretrain_weights:%s" % pretrain_weights)
        sys.exit(0)

    # The post-processor depends only on the configuration, so build it once
    # instead of deep-copying the config and re-creating it for every batch.
    postprocess_params = deepcopy(config["PostProcess"])
    postprocess_params.update(config['Global'])
    postprocess = create_module(postprocess_params['function'])(
        params=postprocess_params)

    save_res_path = config['Global']['save_res_path']
    with open(save_res_path, "wb") as fout:
        # Named batch_reader so the module-level test_reader() is not shadowed.
        batch_reader = reader.test_reader(config=config)
        tackling_num = 0
        for data in batch_reader():
            img_num = len(data)
            tackling_num = tackling_num + img_num
            logger.info("tackling_num:%d", tackling_num)

            # Each sample is indexed as (image, ratio, image name).
            img_list = [sample[0] for sample in data]
            ratio_list = [sample[1] for sample in data]
            img_name_list = [sample[2] for sample in data]
            img_list = np.concatenate(img_list, axis=0)

            outs = exe.run(eval_prog,
                           feed={'image': img_list},
                           fetch_list=eval_fetch_list)
            dt_boxes_list = postprocess(outs, ratio_list)

            for ino in range(img_num):
                dt_boxes = dt_boxes_list[ino]
                img_name = img_name_list[ino]
                dt_boxes_json = []
                for box in dt_boxes:
                    tmp_json = {"transcription": ""}
                    tmp_json['points'] = box.tolist()
                    dt_boxes_json.append(tmp_json)
                otstr = img_name + "\t" + json.dumps(dt_boxes_json) + "\n"
                # The file is opened in binary mode, hence the encode().
                fout.write(otstr.encode())
                #draw_det_res(dt_boxes, config, img_name, ino)
    logger.info("success!")
def test_reader():
    """Iterate the detection test reader once and print throughput figures.

    Debug helper: loads the config named by the module-level ``FLAGS``,
    counts the samples yielded by ``reader.test_reader`` and, every tenth
    batch, prints the running sample count, the size of the current batch
    and the mean wall-clock time per batch so far.
    """
    import time

    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    print(config)
    batch_source = reader.test_reader(config=config)
    sample_total = 0
    began = time.time()
    for batch_no, batch in enumerate(batch_source(), start=1):
        sample_total += len(batch)
        if batch_no % 10 != 0:
            continue
        mean_batch_seconds = (time.time() - began) / batch_no
        print("reader:", sample_total, len(batch), mean_batch_seconds)
    print("finish reader:", sample_total)
    print("success")
if __name__ == '__main__':
    # Script entry point: parse the command line into the module-level FLAGS
    # that main() reads (FLAGS.config, FLAGS.opt), then run the inference.
    parser = ArgsParser()
    FLAGS = parser.parse_args()
    main()
    # test_reader()

116
tools/tmp/infer_rec.py Executable file
View File

@ -0,0 +1,116 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import multiprocessing
import numpy as np
def set_paddle_flags(**kwargs):
    """Export flags as environment variables without overriding existing ones.

    Each keyword argument names one environment variable. A variable that is
    already present in ``os.environ`` is left untouched; a missing one is set
    to ``str(value)``.
    """
    for flag_name in kwargs:
        if flag_name in os.environ:
            # Keep whatever the caller's environment already provides.
            continue
        os.environ[flag_name] = str(kwargs[flag_name])
# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
from paddle import fluid
from ppocr.utils.utility import load_config, merge_config
from ppocr.data.rec.reader_main import test_reader
from ppocr.utils.utility import ArgsParser
from ppocr.utils.character import CharacterOps, cal_predicts_accuracy
from ppocr.utils.check import check_gpu
from ppocr.utils.utility import create_module
from ppocr.utils.utility import initial_logger
logger = initial_logger()
def main():
    """Recognise one image, print the result and export an inference model.

    Reads the module-level ``FLAGS`` (``FLAGS.config`` and ``FLAGS.opt``) set
    by the ``__main__`` guard. Besides printing the raw and decoded
    prediction, it saves the eval program to ``./output/`` via
    ``fluid.io.save_inference_model``.
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    char_ops = CharacterOps(config['Global'])
    # The character count is stored in the config before the model is built.
    config['Global']['char_num'] = char_ops.get_char_num()
    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    rec_model = create_module(config['Architecture']['function'])(params=config)
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_outputs = rec_model(mode="test")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)
    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        fluid.load(eval_prog, pretrain_weights)
    # NOTE(review): 'test_img_path' is read from the top level of the config,
    # not from config['Global'] -- confirm this is the intended location.
    test_img_path = config['test_img_path']
    image_shape = config['Global']['image_shape']
    blobs = test_reader(image_shape, test_img_path)
    # return_numpy=False keeps the fetched tensors so .lod() is available below.
    predict = exe.run(program=eval_prog,
                      feed={"image": blobs},
                      fetch_list=eval_fetch_list,
                      return_numpy=False)
    preds = np.array(predict[0])
    if preds.shape[1] == 1:
        # Single-column output: flatten and decode it directly.
        preds = preds.reshape(-1)
        # NOTE(review): preds_lod is assigned but never used in this function.
        preds_lod = predict[0].lod()[0]
        preds_text = char_ops.decode(preds)
    else:
        # Positions in the first row whose value is 1; presumably a
        # begin/end marker of the decoder output -- TODO confirm.
        end_pos = np.where(preds[0, :] == 1)[0]
        if len(end_pos) <= 1:
            preds_text = preds[0, 1:]
        else:
            # Keep the ids between index 1 and the second occurrence of 1.
            preds_text = preds[0, 1:end_pos[1]]
        preds_text = preds_text.reshape(-1)
        preds_text = char_ops.decode(preds_text)
    fluid.io.save_inference_model(
        "./output/",
        feeded_var_names=['image'],
        target_vars=eval_outputs,
        executor=exe,
        main_program=eval_prog,
        model_filename="model",
        params_filename="params")
    print(preds)
    print(preds_text)
if __name__ == '__main__':
    # Script entry point: parse the command line into the module-level FLAGS
    # that main() reads (FLAGS.config, FLAGS.opt), then run the inference.
    parser = ArgsParser()
    FLAGS = parser.parse_args()
    main()

128
tools/tmp/test_rec_benchmark.py Executable file
View File

@ -0,0 +1,128 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import time
import multiprocessing
import numpy as np
def set_paddle_flags(**kwargs):
    """Export flags as environment variables without overriding existing ones.

    Each keyword argument names one environment variable. A variable that is
    already present in ``os.environ`` is left untouched; a missing one is set
    to ``str(value)``.
    """
    for flag_name in kwargs:
        if flag_name in os.environ:
            # Keep whatever the caller's environment already provides.
            continue
        os.environ[flag_name] = str(kwargs[flag_name])
# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
from paddle import fluid
from ppocr.utils.utility import load_config, merge_config
import ppocr.data.rec.reader_main as reader
from ppocr.utils.utility import ArgsParser
from ppocr.utils.character import CharacterOps, cal_predicts_accuracy
from ppocr.utils.check import check_gpu
from ppocr.utils.utility import create_module
from ppocr.utils.eval_utils import eval_run
from ppocr.utils.utility import initial_logger
logger = initial_logger()
def main():
    """Evaluate a recognition model on a fixed list of benchmark datasets.

    Reads the module-level ``FLAGS`` (``FLAGS.config`` and ``FLAGS.opt``) set
    by the ``__main__`` guard. Each name in ``eval_data_list`` is appended to
    ``config['TestReader']['lmdb_sets_dir']`` in turn, evaluated with
    ``eval_run``, and the per-dataset accuracy plus the overall accuracy and
    the time per sample are logged.
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    char_ops = CharacterOps(config['Global'])
    config['Global']['char_num'] = char_ops.get_char_num()
    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)
    # NOTE(review): devices_num is computed but never used in this function.
    if use_gpu:
        devices_num = fluid.core.get_cuda_device_count()
    else:
        devices_num = int(
            os.environ.get('CPU_NUM', multiprocessing.cpu_count()))
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    rec_model = create_module(config['Architecture']['function'])(params=config)
    startup_prog = fluid.Program()
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_loader, eval_outputs = rec_model(mode="eval")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)
    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        fluid.load(eval_prog, pretrain_weights)
    # Sub-directory names looked up under the configured lmdb_sets_dir.
    eval_data_list = ['IIIT5k_3000', 'SVT', 'IC03_860', 'IC03_867',\
        'IC13_857', 'IC13_1015', 'IC15_1811', 'IC15_2077', 'SVTP', 'CUTE80']
    eval_data_dir = config['TestReader']['lmdb_sets_dir']
    total_forward_time = 0
    total_evaluation_data_number = 0
    total_correct_number = 0
    eval_data_acc_info = {}
    for eval_data in eval_data_list:
        # Point the reader at this dataset by rewriting the config in place.
        config['TestReader']['lmdb_sets_dir'] = \
            eval_data_dir + "/" + eval_data
        eval_reader = reader.train_eval_reader(
            config=config, char_ops=char_ops, mode="test")
        eval_loader.set_sample_list_generator(eval_reader, places=place)
        start_time = time.time()
        outs = eval_run(exe, eval_prog, eval_loader, eval_fetch_list, char_ops,
                        "best", "test")
        infer_time = time.time() - start_time
        # eval_run's result unpacks into (accuracy, correct count, sample count).
        eval_acc, acc_num, sample_num = outs
        total_forward_time += infer_time
        total_evaluation_data_number += sample_num
        total_correct_number += acc_num
        eval_data_acc_info[eval_data] = outs
    # NOTE(review): both divisions raise ZeroDivisionError when no sample was
    # evaluated at all.
    avg_forward_time = total_forward_time / total_evaluation_data_number
    avg_acc = total_correct_number * 1.0 / total_evaluation_data_number
    logger.info('-' * 50)
    strs = ""
    for eval_data in eval_data_list:
        eval_acc, acc_num, sample_num = eval_data_acc_info[eval_data]
        strs += "\n {}, accuracy:{:.6f}".format(eval_data, eval_acc)
    strs += "\n average, accuracy:{:.6f}, time:{:.6f}".format(avg_acc,
                                                              avg_forward_time)
    logger.info(strs)
    logger.info('-' * 50)
if __name__ == '__main__':
    # Script entry point: parse the command line into the module-level FLAGS
    # that main() reads (FLAGS.config, FLAGS.opt), then run the benchmark.
    parser = ArgsParser()
    FLAGS = parser.parse_args()
    main()

216
tools/tmp/train_det.py Executable file
View File

@ -0,0 +1,216 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import multiprocessing
import numpy as np
# from paddle.fluid.contrib.model_stat import summary
def set_paddle_flags(**kwargs):
    """Export flags as environment variables without overriding existing ones.

    Each keyword argument names one environment variable. A variable that is
    already present in ``os.environ`` is left untouched; a missing one is set
    to ``str(value)``.
    """
    for flag_name in kwargs:
        if flag_name in os.environ:
            # Keep whatever the caller's environment already provides.
            continue
        os.environ[flag_name] = str(kwargs[flag_name])
# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import load_config, merge_config
import ppocr.data.det.reader_main as reader
from ppocr.utils.utility import ArgsParser
from ppocr.utils.character import CharacterOps, cal_predicts_accuracy
from ppocr.utils.check import check_gpu
from ppocr.utils.stats import TrainingStats
from ppocr.utils.checkpoint import load_pretrain, load_checkpoint, save, save_model
from ppocr.utils.eval_utils import eval_run
from ppocr.utils.eval_utils import eval_det_run
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.utility import create_multi_devices_program
def main():
    """Train a text-detection model ('EAST' or 'DB') with periodic evaluation.

    Reads the module-level ``FLAGS`` (``FLAGS.config`` and ``FLAGS.opt``) set
    by the ``__main__`` guard. Every ``eval_step`` batches the model is
    evaluated with ``eval_det_run``; when ``hmean`` is at least the best seen
    so far the train program is saved to ``<save_dir>/best_accuracy``. A
    snapshot ``<save_dir>/iter_epoch_<n>`` is saved for every epoch ``n > 0``
    that is a multiple of ``save_epoch_step``.
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    print(config)
    alg = config['Global']['algorithm']
    assert alg in ['EAST', 'DB']
    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    det_model = create_module(config['Architecture']['function'])(params=config)
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    # Build the network and the optimizer ops inside train_prog.
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_loader, train_outputs = det_model(mode="train")
            train_fetch_list = [v.name for v in train_outputs]
            # The first model output is the loss that gets minimized.
            train_loss = train_outputs[0]
            opt_params = config['Optimizer']
            optimizer = create_module(opt_params['function'])(opt_params)
            optimizer.minimize(train_loss)
            # Fetch the learning rate as the LAST entry of every run so it
            # can be logged (read back below as train_outs[-1]).
            global_lr = optimizer._global_learning_rate()
            global_lr.persistable = True
            train_fetch_list.append(global_lr.name)
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_loader, eval_outputs = det_model(mode="eval")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)
    train_reader = reader.train_reader(config=config)
    train_loader.set_sample_list_generator(train_reader, places=place)
    exe.run(startup_prog)
    # compile program for multi-devices
    train_compile_program = create_multi_devices_program(train_prog,
                                                         train_loss.name)
    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        load_pretrain(exe, train_prog, pretrain_weights)
        print("pretrain weights loaded!")
    train_batch_id = 0
    # Names of the smoothed statistics; they match the keys of `stats` below.
    if alg == 'EAST':
        train_log_keys = ['loss_total', 'loss_cls', 'loss_offset']
    elif alg == 'DB':
        train_log_keys = [
            'loss_total', 'loss_shrink', 'loss_threshold', 'loss_binary'
        ]
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    print_step = config['Global']['print_step']
    eval_step = config['Global']['eval_step']
    save_epoch_step = config['Global']['save_epoch_step']
    save_dir = config['Global']['save_dir']
    train_stats = TrainingStats(log_smooth_window, train_log_keys)
    best_eval_hmean = -1
    best_batch_id = 0
    best_epoch = 0
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            # Runs until the loader is exhausted, which surfaces as
            # fluid.core.EOFException and ends the epoch.
            while True:
                t1 = time.time()
                train_outs = exe.run(program=train_compile_program,
                                     fetch_list=train_fetch_list,
                                     return_numpy=False)
                loss_total = np.mean(np.array(train_outs[0]))
                if alg == 'EAST':
                    loss_cls = np.mean(np.array(train_outs[1]))
                    loss_offset = np.mean(np.array(train_outs[2]))
                    stats = {'loss_total':loss_total, 'loss_cls':loss_cls,\
                        'loss_offset':loss_offset}
                elif alg == 'DB':
                    loss_shrink_maps = np.mean(np.array(train_outs[1]))
                    loss_threshold_maps = np.mean(np.array(train_outs[2]))
                    loss_binary_maps = np.mean(np.array(train_outs[3]))
                    stats = {'loss_total':loss_total, 'loss_shrink':loss_shrink_maps, \
                        'loss_threshold':loss_threshold_maps, 'loss_binary':loss_binary_maps}
                # Learning rate: appended last to train_fetch_list above.
                lr = np.mean(np.array(train_outs[-1]))
                t2 = time.time()
                train_batch_elapse = t2 - t1
                # stats = {'loss_total':loss_total, 'loss_cls':loss_cls,\
                # 'loss_offset':loss_offset}
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, lr, logs, train_batch_elapse)
                    logger.info(strs)
                if train_batch_id > 0 and\
                    train_batch_id % eval_step == 0:
                    metrics = eval_det_run(exe, eval_prog, eval_fetch_list,
                                           config, "eval")
                    hmean = metrics['hmean']
                    # ">=" means a tie with the best score also overwrites
                    # the saved best model.
                    if hmean >= best_eval_hmean:
                        best_eval_hmean = hmean
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_dir + "/best_accuracy"
                        save_model(train_prog, save_path)
                    strs = 'Test iter: {}, metrics:{}, best_hmean:{:.6f}, best_epoch:{}, best_batch_id:{}'.format(
                        train_batch_id, metrics, best_eval_hmean, best_epoch,
                        best_batch_id)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        # Epoch 0 is never snapshotted because of the "epoch > 0" test.
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_dir + "/iter_epoch_%d" % (epoch)
            save_model(train_prog, save_path)
def test_reader():
    """Loop over the detection train reader forever, printing batch timings.

    Debug helper: it loads the config named by the module-level ``FLAGS`` and
    repeatedly iterates ``reader.train_reader``. The ``while True`` loop has
    no ``break``, so the function never returns normally.
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    print(config)
    tmp_reader = reader.train_reader(config=config)
    count = 0
    print_count = 0
    import time
    while True:
        starttime = time.time()
        # Restart the batch counter for every pass over the reader.
        count = 0
        for data in tmp_reader():
            count += 1
            # print_count is never incremented and x % 1 is always 0, so
            # this condition holds for every batch.
            if print_count % 1 == 0:
                # Time since the previous batch was reported.
                batch_time = time.time() - starttime
                starttime = time.time()
                print("reader:", count, len(data), batch_time)
        # NOTE(review): the nesting of the two prints below is inferred;
        # at function level they would be unreachable after `while True`.
        print("finish reader:", count)
        print("success")
if __name__ == '__main__':
    # Script entry point: parse the command line into the module-level FLAGS
    # that main() reads (FLAGS.config, FLAGS.opt), then start training.
    parser = ArgsParser()
    # NOTE(review): --resume_checkpoint is accepted here, but main() in this
    # script never reads FLAGS.resume_checkpoint.
    parser.add_argument(
        "-r",
        "--resume_checkpoint",
        default=None,
        type=str,
        help="Checkpoint path for resuming training.")
    FLAGS = parser.parse_args()
    main()
    # test_reader()

222
tools/tmp/train_rec.py Executable file
View File

@ -0,0 +1,222 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import multiprocessing
import numpy as np
# from paddle.fluid.contrib.model_stat import summary
def set_paddle_flags(**kwargs):
    """Export flags as environment variables without overriding existing ones.

    Each keyword argument names one environment variable. A variable that is
    already present in ``os.environ`` is left untouched; a missing one is set
    to ``str(value)``.
    """
    for flag_name in kwargs:
        if flag_name in os.environ:
            # Keep whatever the caller's environment already provides.
            continue
        os.environ[flag_name] = str(kwargs[flag_name])
# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
from paddle import fluid
from ppocr.utils.utility import create_module
from ppocr.utils.utility import load_config, merge_config
import ppocr.data.rec.reader_main as reader
from ppocr.utils.utility import ArgsParser
from ppocr.utils.character import CharacterOps, cal_predicts_accuracy
from ppocr.utils.check import check_gpu
from ppocr.utils.stats import TrainingStats
from ppocr.utils.checkpoint import load_pretrain, load_checkpoint, save, save_model
from ppocr.utils.eval_utils import eval_run
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.utils.utility import create_multi_devices_program
def main():
    """Train a text-recognition model with periodic evaluation.

    Reads the module-level ``FLAGS`` (``FLAGS.config`` and ``FLAGS.opt``) set
    by the ``__main__`` guard. Every ``eval_step`` batches the model is
    evaluated with ``eval_run``; when the accuracy strictly improves the train
    program is saved to ``<save_dir>/best_accuracy``. A snapshot
    ``<save_dir>/iter_epoch_<n>`` is saved for every epoch ``n > 0`` that is a
    multiple of ``save_epoch_step``.
    """
    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    char_ops = CharacterOps(config['Global'])
    # The character count is stored in the config before the model is built.
    config['Global']['char_num'] = char_ops.get_char_num()
    print(config)
    # check if set use_gpu=True in paddlepaddle cpu version
    use_gpu = config['Global']['use_gpu']
    check_gpu(use_gpu)
    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    exe = fluid.Executor(place)
    rec_model = create_module(config['Architecture']['function'])(params=config)
    startup_prog = fluid.Program()
    train_prog = fluid.Program()
    # Build the network and the optimizer ops inside train_prog.
    with fluid.program_guard(train_prog, startup_prog):
        with fluid.unique_name.guard():
            train_loader, train_outputs = rec_model(mode="train")
            # NOTE(review): save_var is assigned but never used below.
            save_var = train_outputs[1]
            # Optional global-norm gradient clipping, registered before
            # optimizer.minimize() is called.
            if "gradient_clip" in config['Global']:
                gradient_clip = config['Global']['gradient_clip']
                clip = fluid.clip.GradientClipByGlobalNorm(gradient_clip)
                fluid.clip.set_gradient_clip(clip, program=train_prog)
            train_fetch_list = [v.name for v in train_outputs]
            # The first model output is the loss that gets minimized.
            train_loss = train_outputs[0]
            opt_params = config['Optimizer']
            optimizer = create_module(opt_params['function'])(opt_params)
            optimizer.minimize(train_loss)
            # Fetch the learning rate as the LAST entry of every run so it
            # can be logged (read back below as train_outs[-1]).
            global_lr = optimizer._global_learning_rate()
            global_lr.persistable = True
            train_fetch_list.append(global_lr.name)
    train_reader = reader.train_eval_reader(
        config=config, char_ops=char_ops, mode="train")
    train_loader.set_sample_list_generator(train_reader, places=place)
    eval_prog = fluid.Program()
    with fluid.program_guard(eval_prog, startup_prog):
        with fluid.unique_name.guard():
            eval_loader, eval_outputs = rec_model(mode="eval")
            eval_fetch_list = [v.name for v in eval_outputs]
    eval_prog = eval_prog.clone(for_test=True)
    exe.run(startup_prog)
    eval_reader = reader.train_eval_reader(
        config=config, char_ops=char_ops, mode="eval")
    eval_loader.set_sample_list_generator(eval_reader, places=place)
    # compile program for multi-devices
    train_compile_program = create_multi_devices_program(train_prog,
                                                         train_loss.name)
    pretrain_weights = config['Global']['pretrain_weights']
    if pretrain_weights is not None:
        load_pretrain(exe, train_prog, pretrain_weights)
    train_batch_id = 0
    train_log_keys = ['loss', 'acc']
    log_smooth_window = config['Global']['log_smooth_window']
    epoch_num = config['Global']['epoch_num']
    # NOTE(review): loss_type is read (KeyError if absent) but never used.
    loss_type = config['Global']['loss_type']
    print_step = config['Global']['print_step']
    eval_step = config['Global']['eval_step']
    save_epoch_step = config['Global']['save_epoch_step']
    save_dir = config['Global']['save_dir']
    train_stats = TrainingStats(log_smooth_window, train_log_keys)
    best_eval_acc = -1
    best_batch_id = 0
    best_epoch = 0
    for epoch in range(epoch_num):
        train_loader.start()
        try:
            # Runs until the loader is exhausted, which surfaces as
            # fluid.core.EOFException and ends the epoch.
            while True:
                t1 = time.time()
                train_outs = exe.run(program=train_compile_program,
                                     fetch_list=train_fetch_list,
                                     return_numpy=False)
                loss = np.mean(np.array(train_outs[0]))
                # Learning rate: appended last to train_fetch_list above.
                lr = np.mean(np.array(train_outs[-1]))
                # Outputs 1 and 2 are treated as predictions and labels; both
                # are read together with their first-level LoD.
                preds = np.array(train_outs[1])
                preds_lod = train_outs[1].lod()[0]
                labels = np.array(train_outs[2])
                labels_lod = train_outs[2].lod()[0]
                acc, acc_num, img_num = cal_predicts_accuracy(
                    char_ops, preds, preds_lod, labels, labels_lod)
                t2 = time.time()
                train_batch_elapse = t2 - t1
                stats = {'loss': loss, 'acc': acc}
                train_stats.update(stats)
                if train_batch_id > 0 and train_batch_id % print_step == 0:
                    logs = train_stats.log()
                    strs = 'epoch: {}, iter: {}, lr: {:.6f}, {}, time: {:.3f}'.format(
                        epoch, train_batch_id, lr, logs, train_batch_elapse)
                    logger.info(strs)
                if train_batch_id > 0 and train_batch_id % eval_step == 0:
                    outs = eval_run(exe, eval_prog, eval_loader,
                                    eval_fetch_list, char_ops, train_batch_id,
                                    "eval")
                    eval_acc, acc_num, sample_num = outs
                    # Strict ">": a tie keeps the previously saved best model.
                    if eval_acc > best_eval_acc:
                        best_eval_acc = eval_acc
                        best_batch_id = train_batch_id
                        best_epoch = epoch
                        save_path = save_dir + "/best_accuracy"
                        save_model(train_prog, save_path)
                    strs = 'Test iter: {}, acc:{:.6f}, best_acc:{:.6f}, best_epoch:{}, best_batch_id:{}, sample_num:{}'.format(
                        train_batch_id, eval_acc, best_eval_acc, best_epoch,
                        best_batch_id, sample_num)
                    logger.info(strs)
                train_batch_id += 1
        except fluid.core.EOFException:
            train_loader.reset()
        # Epoch 0 is never snapshotted because of the "epoch > 0" test.
        if epoch > 0 and epoch % save_epoch_step == 0:
            save_path = save_dir + "/iter_epoch_%d" % (epoch)
            save_model(train_prog, save_path)
def test_reader():
    """Iterate the recognition eval reader once and print throughput figures.

    Debug helper: loads the config named by the module-level ``FLAGS``,
    counts the samples yielded by ``reader.train_eval_reader`` in "eval"
    mode and, every tenth batch, prints the running sample count, the size
    of the current batch and the mean wall-clock time per batch so far.
    """
    import time

    config = load_config(FLAGS.config)
    merge_config(FLAGS.opt)
    char_ops = CharacterOps(config['Global'])
    config['Global']['char_num'] = char_ops.get_char_num()
    print(config)
    # Pass mode="train" instead to exercise the training reader.
    batch_source = reader.train_eval_reader(
        config=config, char_ops=char_ops, mode="eval")
    sample_total = 0
    began = time.time()
    for batch_no, batch in enumerate(batch_source(), start=1):
        sample_total += len(batch)
        if batch_no % 10 != 0:
            continue
        mean_batch_seconds = (time.time() - began) / batch_no
        print("reader:", sample_total, len(batch), mean_batch_seconds)
    print("finish reader:", sample_total)
    print("success")
if __name__ == '__main__':
    # Script entry point: parse the command line into the module-level FLAGS
    # that main() reads (FLAGS.config, FLAGS.opt), then start training.
    parser = ArgsParser()
    # NOTE(review): --resume_checkpoint is accepted here, but main() in this
    # script never reads FLAGS.resume_checkpoint.
    parser.add_argument(
        "-r",
        "--resume_checkpoint",
        default=None,
        type=str,
        help="Checkpoint path for resuming training.")
    FLAGS = parser.parse_args()
    main()
    # test_reader()

113
tools/train.py Executable file
View File

@ -0,0 +1,113 @@
# Copyright (c) 2020 PaddlePaddle Authors. All Rights Reserved.
#
# Licensed under the Apache License, Version 2.0 (the "License");
# you may not use this file except in compliance with the License.
# You may obtain a copy of the License at
#
# http://www.apache.org/licenses/LICENSE-2.0
#
# Unless required by applicable law or agreed to in writing, software
# distributed under the License is distributed on an "AS IS" BASIS,
# WITHOUT WARRANTIES OR CONDITIONS OF ANY KIND, either express or implied.
# See the License for the specific language governing permissions and
# limitations under the License.
from __future__ import absolute_import
from __future__ import division
from __future__ import print_function
import os
import sys
import time
import multiprocessing
import numpy as np
def set_paddle_flags(**kwargs):
    """Export flags as environment variables without overriding existing ones.

    Each keyword argument names one environment variable. A variable that is
    already present in ``os.environ`` is left untouched; a missing one is set
    to ``str(value)``.
    """
    for flag_name in kwargs:
        if flag_name in os.environ:
            # Keep whatever the caller's environment already provides.
            continue
        os.environ[flag_name] = str(kwargs[flag_name])
# NOTE(paddle-dev): All of these flags should be
# set before `import paddle`. Otherwise, it would
# not take any effect.
set_paddle_flags(
FLAGS_eager_delete_tensor_gb=0, # enable GC to save memory
)
import program
from paddle import fluid
from ppocr.utils.utility import initial_logger
logger = initial_logger()
from ppocr.data.reader_main import reader_main
from ppocr.utils.save_load import init_model
from ppocr.utils.character import CharacterOps
def main():
    """Build the train/eval programs from the config and start training.

    Reads the module-level ``FLAGS`` (``FLAGS.config`` and ``FLAGS.opt``) set
    by the ``__main__`` guard. Detection algorithms ('EAST', 'DB') are handed
    to ``program.train_eval_det_run``; every other supported algorithm goes
    to ``program.train_eval_rec_run``.

    Raises:
        AssertionError: if ``config['Global']['algorithm']`` is not one of
            the supported names.
    """
    config = program.load_config(FLAGS.config)
    program.merge_config(FLAGS.opt)
    logger.info(config)

    # check if set use_gpu=True in paddlepaddle cpu version
    # The configured value is passed (not a hard-coded True) so that a run
    # configured with use_gpu=False is checked as a CPU run.
    use_gpu = config['Global']['use_gpu']
    program.check_gpu(use_gpu)

    alg = config['Global']['algorithm']
    assert alg in ['EAST', 'DB', 'Rosetta', 'CRNN', 'STARNet', 'RARE']
    # Recognition algorithms get a CharacterOps instance stored in the config.
    if alg in ['Rosetta', 'CRNN', 'STARNet', 'RARE']:
        config['Global']['char_ops'] = CharacterOps(config['Global'])

    place = fluid.CUDAPlace(0) if use_gpu else fluid.CPUPlace()
    startup_program = fluid.Program()
    train_program = fluid.Program()
    # program.build returns a sequence: [0] data loader, [1] fetch names,
    # [2] fetch variable names, [3] name of the loss that is optimized.
    train_build_outputs = program.build(
        config, train_program, startup_program, mode='train')
    train_loader = train_build_outputs[0]
    train_fetch_name_list = train_build_outputs[1]
    train_fetch_varname_list = train_build_outputs[2]
    train_opt_loss_name = train_build_outputs[3]

    eval_program = fluid.Program()
    eval_build_outputs = program.build(
        config, eval_program, startup_program, mode='eval')
    eval_fetch_name_list = eval_build_outputs[1]
    eval_fetch_varname_list = eval_build_outputs[2]
    eval_program = eval_program.clone(for_test=True)

    train_reader = reader_main(config=config, mode="train")
    train_loader.set_sample_list_generator(train_reader, places=place)

    eval_reader = reader_main(config=config, mode="eval")

    exe = fluid.Executor(place)
    exe.run(startup_program)

    # compile program for multi-devices
    train_compile_program = program.create_multi_devices_program(
        train_program, train_opt_loss_name)
    init_model(config, train_program, exe)

    # Everything the training loops need, bundled per phase.
    train_info_dict = {
        'compile_program': train_compile_program,
        'train_program': train_program,
        'reader': train_loader,
        'fetch_name_list': train_fetch_name_list,
        'fetch_varname_list': train_fetch_varname_list,
    }
    eval_info_dict = {
        'program': eval_program,
        'reader': eval_reader,
        'fetch_name_list': eval_fetch_name_list,
        'fetch_varname_list': eval_fetch_varname_list,
    }

    if alg in ['EAST', 'DB']:
        program.train_eval_det_run(config, exe, train_info_dict, eval_info_dict)
    else:
        program.train_eval_rec_run(config, exe, train_info_dict, eval_info_dict)
if __name__ == '__main__':
    # Script entry point: parse the command line into the module-level FLAGS
    # that main() reads (FLAGS.config, FLAGS.opt), then start training.
    parser = program.ArgsParser()
    FLAGS = parser.parse_args()
    main()
    # test_reader()