SengTak committed
Commit 212a4f1
Parent: 45fa7cb

duplicate from OCR
This view is limited to 50 files because the commit contains too many changes; see the raw diff for the full set.
Files changed (50):
  1. Home.py +17 -0
  2. README.md +2 -1
  3. __pycache__/multipage.cpython-37.pyc +0 -0
  4. app_pages/.DS_Store +0 -0
  5. app_pages/__pycache__/about.cpython-37.pyc +0 -0
  6. app_pages/__pycache__/home.cpython-37.pyc +0 -0
  7. app_pages/__pycache__/ocr_comparator.cpython-37.pyc +0 -0
  8. app_pages/about.py +37 -0
  9. app_pages/home.py +19 -0
  10. app_pages/img_demo_1.jpg +0 -0
  11. app_pages/img_demo_2.jpg +0 -0
  12. app_pages/ocr.png +0 -0
  13. app_pages/ocr_comparator.py +1421 -0
  14. configs/_base_/default_runtime.py +17 -0
  15. configs/_base_/det_datasets/ctw1500.py +18 -0
  16. configs/_base_/det_datasets/icdar2015.py +18 -0
  17. configs/_base_/det_datasets/icdar2017.py +18 -0
  18. configs/_base_/det_datasets/synthtext.py +18 -0
  19. configs/_base_/det_datasets/toy_data.py +41 -0
  20. configs/_base_/det_models/dbnet_r18_fpnc.py +21 -0
  21. configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py +23 -0
  22. configs/_base_/det_models/dbnetpp_r50dcnv2_fpnc.py +28 -0
  23. configs/_base_/det_models/drrg_r50_fpn_unet.py +21 -0
  24. configs/_base_/det_models/fcenet_r50_fpn.py +33 -0
  25. configs/_base_/det_models/fcenet_r50dcnv2_fpn.py +35 -0
  26. configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py +126 -0
  27. configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py +126 -0
  28. configs/_base_/det_models/panet_r18_fpem_ffm.py +43 -0
  29. configs/_base_/det_models/panet_r50_fpem_ffm.py +21 -0
  30. configs/_base_/det_models/psenet_r50_fpnf.py +51 -0
  31. configs/_base_/det_models/textsnake_r50_fpn_unet.py +22 -0
  32. configs/_base_/det_pipelines/dbnet_pipeline.py +88 -0
  33. configs/_base_/det_pipelines/drrg_pipeline.py +60 -0
  34. configs/_base_/det_pipelines/fcenet_pipeline.py +118 -0
  35. configs/_base_/det_pipelines/maskrcnn_pipeline.py +57 -0
  36. configs/_base_/det_pipelines/panet_pipeline.py +156 -0
  37. configs/_base_/det_pipelines/psenet_pipeline.py +70 -0
  38. configs/_base_/det_pipelines/textsnake_pipeline.py +65 -0
  39. configs/_base_/recog_datasets/MJ_train.py +21 -0
  40. configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py +31 -0
  41. configs/_base_/recog_datasets/ST_MJ_train.py +29 -0
  42. configs/_base_/recog_datasets/ST_SA_MJ_real_train.py +81 -0
  43. configs/_base_/recog_datasets/ST_SA_MJ_train.py +48 -0
  44. configs/_base_/recog_datasets/ST_charbox_train.py +23 -0
  45. configs/_base_/recog_datasets/academic_test.py +57 -0
  46. configs/_base_/recog_datasets/seg_toy_data.py +34 -0
  47. configs/_base_/recog_datasets/toy_data.py +54 -0
  48. configs/_base_/recog_models/abinet.py +70 -0
  49. configs/_base_/recog_models/crnn.py +12 -0
  50. configs/_base_/recog_models/crnn_tps.py +18 -0
Home.py ADDED
@@ -0,0 +1,17 @@
+import streamlit as st
+from multipage import MultiPage
+from app_pages import home, about, ocr_comparator
+
+app = MultiPage()
+st.set_page_config(
+    page_title='OCR Comparator', layout="wide",
+    initial_sidebar_state="expanded",
+)
+
+# Add all the application pages here
+app.add_page("Home", "house", home.app)
+app.add_page("About", "info-circle", about.app)
+app.add_page("App", "cast", ocr_comparator.app)
+
+# The main app
+app.run()
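multipage.py itself is not among the 50 files shown here (only its compiled `__pycache__` entry appears elsewhere in this commit), but the calls above imply a small page-registry class. A minimal sketch, assuming a plain sidebar radio menu — the `add_page`/`run` signatures come from the usage in Home.py; everything else is an assumption:

```python
import streamlit as st

class MultiPage:
    """Sketch of the page registry implied by Home.py (not the committed source)."""

    def __init__(self):
        self.pages = []

    def add_page(self, title, icon, func):
        # The icon names suggest a bootstrap-icons-based menu (e.g. streamlit-option-menu);
        # this sketch stores the icon but renders a plain radio menu instead.
        self.pages.append({'title': title, 'icon': icon, 'function': func})

    def run(self):
        choice = st.sidebar.radio('Navigation', [p['title'] for p in self.pages])
        for page in self.pages:
            if page['title'] == choice:
                page['function']()
```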
README.md CHANGED
@@ -5,7 +5,8 @@ colorFrom: purple
 colorTo: green
 sdk: streamlit
 sdk_version: 1.27.2
-app_file: app.py
+app_file: Home.py
+tags: [streamlit, ocr]
 pinned: false
 ---
 
__pycache__/multipage.cpython-37.pyc ADDED
Binary file (2.65 kB).

app_pages/.DS_Store ADDED
Binary file (6.15 kB).

app_pages/__pycache__/about.cpython-37.pyc ADDED
Binary file (2.02 kB).

app_pages/__pycache__/home.cpython-37.pyc ADDED
Binary file (889 Bytes).

app_pages/__pycache__/ocr_comparator.cpython-37.pyc ADDED
Binary file (48.1 kB).

app_pages/about.py ADDED
@@ -0,0 +1,37 @@
+import streamlit as st
+
+def app():
+    st.title("OCR solutions comparator")
+
+    st.write("")
+    st.write("")
+    st.write("")
+
+    st.markdown("##### This app allows you to compare, from a given picture, the results of different solutions:")
+    st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
+    st.write("")
+    st.write("")
+
+    st.markdown(''' The first step is to choose the language for the text recognition (not all solutions \
+support the same languages), and then to choose the picture to consider. It is possible to upload a file, \
+to take a picture, or to use a demo file. \
+It is then possible to change the default values for the text area detection process, \
+before launching the detection task for each solution.''')
+    st.write("")
+
+    st.markdown(''' The different results are then presented. The second step is to choose one of these \
+detection results, in order to carry out the text recognition process on it. It is also possible to change \
+the default settings for each solution.''')
+    st.write("")
+
+    st.markdown("###### The recognition results appear in 2 formats:")
+    st.markdown(''' - a visual format reproduces the initial image, replacing the detected areas with \
+the recognized text. The background is more or less strongly colored in green according to the \
+confidence level of the recognition.
+A slider allows you to change the font size; another \
+allows you to modify the confidence threshold above which the text color changes: if it is set at \
+70% for example, then all the texts with a confidence level greater than or equal to 70 will appear \
+in white, and in black otherwise.''')
+
+    st.markdown(" - a detailed format presents the results in a table, for each text box detected. \
+It is possible to download these results as a local CSV file.")
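The white/black text rule described above reduces to a single comparison; a minimal sketch, with the 70% figure taken from the example in the text (the app exposes both the font size and the threshold as sliders):

```python
def reco_text_color(confidence, threshold=70):
    # Confidence at or above the slider threshold -> white text, else black
    return 'white' if confidence >= threshold else 'black'

assert reco_text_color(70) == 'white'
assert reco_text_color(69.9) == 'black'
```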
app_pages/home.py ADDED
@@ -0,0 +1,19 @@
+import streamlit as st
+
+def app():
+    st.image('ocr.png')
+
+    st.write("")
+
+    st.markdown('''#### OCR, or Optical Character Recognition, is a computer vision task \
+which includes the detection of text areas and the recognition of characters.''')
+    st.write("")
+    st.write("")
+
+    st.markdown("##### This app allows you to compare, from a given image, the results of different solutions:")
+    st.markdown("##### *EasyOcr, PaddleOCR, MMOCR, Tesseract*")
+    st.write("")
+    st.write("")
+    st.markdown("👈 Select the **About** page from the sidebar for information on how the app works")
+
+    st.markdown("👈 or directly select the **App** page")
app_pages/img_demo_1.jpg ADDED
app_pages/img_demo_2.jpg ADDED
app_pages/ocr.png ADDED
app_pages/ocr_comparator.py ADDED
@@ -0,0 +1,1421 @@
+"""This Streamlit app allows you to compare, from a given image, the results of different solutions:
+EasyOcr, PaddleOCR, MMOCR, Tesseract
+"""
+import streamlit as st
+import plotly.express as px
+import numpy as np
+import math
+import pandas as pd
+from time import sleep
+
+import cv2
+from PIL import Image, ImageColor
+import PIL
+import easyocr
+from paddleocr import PaddleOCR
+from mmocr.utils.ocr import MMOCR
+import pytesseract
+from pytesseract import Output
+import os
+from mycolorpy import colorlist as mcp
+
+
+###################################################################################################
+## MAIN
+###################################################################################################
+def app():
+
+###################################################################################################
+## FUNCTIONS
+###################################################################################################
+
+    @st.cache
+    def convert_df(in_df):
+        """Convert a data frame to CSV, used by the download button
+
+        Args:
+            in_df (data frame): data frame to convert
+
+        Returns:
+            bytes: UTF-8-encoded CSV content of the data frame
+        """
+        # IMPORTANT: Cache the conversion to prevent computation on every rerun
+        return in_df.to_csv().encode('utf-8')
+
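`convert_df` exists to feed Streamlit's download button with cached CSV bytes; a minimal sketch of that pairing (the label and file name are illustrative; `st.download_button` is standard Streamlit):

```python
import pandas as pd
import streamlit as st

df_results = pd.DataFrame({'text': ['hello'], 'confidence': [98.2]})
csv_bytes = df_results.to_csv().encode('utf-8')   # what convert_df returns, minus the caching

st.download_button(label="Download results", data=csv_bytes,
                   file_name="ocr_results.csv", mime="text/csv")
```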
+    ###
+    def easyocr_coord_convert(in_list_coord):
+        """Convert easyocr coordinates to the standard format used by the other functions
+
+        Args:
+            in_list_coord (list of numbers): format [x_min, x_max, y_min, y_max]
+
+        Returns:
+            list of lists: format [ [x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max] ]
+        """
+
+        coord = in_list_coord
+        return [[coord[0], coord[2]], [coord[1], coord[2]], [coord[1], coord[3]], [coord[0], coord[3]]]
+
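A quick worked example of the conversion above — EasyOCR's `[x_min, x_max, y_min, y_max]` box becomes four corner points, clockwise from the top-left:

```python
box = [10, 50, 20, 40]              # x_min=10, x_max=50, y_min=20, y_max=40
print(easyocr_coord_convert(box))
# [[10, 20], [50, 20], [50, 40], [10, 40]]
```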
+    ###
+    @st.cache(show_spinner=False)
+    def initializations():
+        """Initializations for the app
+
+        Returns:
+            list of strings : list of OCR solutions names
+                              (['EasyOCR', 'PPOCR', 'MMOCR', 'Tesseract'])
+            dict : names and indices of the OCR solutions
+                   ({'EasyOCR': 0, 'PPOCR': 1, 'MMOCR': 2, 'Tesseract': 3})
+            list of dicts : list of languages supported by each OCR solution
+            list of int : columns for recognition details results
+            dict : confidence color scale
+            plotly figure : confidence color scale figure
+        """
+        # The readers considered
+        out_reader_type_list = ['EasyOCR', 'PPOCR', 'MMOCR', 'Tesseract']
+        out_reader_type_dict = {'EasyOCR': 0, 'PPOCR': 1, 'MMOCR': 2, 'Tesseract': 3}
+
+        # Columns for recognition details results
+        out_cols_size = [2] + [2,1]*(len(out_reader_type_list)-1)  # Except Tesseract
+
+        # Dicts of languages supported by each reader
+        out_dict_lang_easyocr = {'Abaza': 'abq', 'Adyghe': 'ady', 'Afrikaans': 'af', 'Angika': 'ang',
+            'Arabic': 'ar', 'Assamese': 'as', 'Avar': 'ava', 'Azerbaijani': 'az', 'Belarusian': 'be',
+            'Bulgarian': 'bg', 'Bihari': 'bh', 'Bhojpuri': 'bho', 'Bengali': 'bn', 'Bosnian': 'bs',
+            'Simplified Chinese': 'ch_sim', 'Traditional Chinese': 'ch_tra', 'Chechen': 'che',
+            'Czech': 'cs', 'Welsh': 'cy', 'Danish': 'da', 'Dargwa': 'dar', 'German': 'de',
+            'English': 'en', 'Spanish': 'es', 'Estonian': 'et', 'Persian (Farsi)': 'fa', 'French': 'fr',
+            'Irish': 'ga', 'Goan Konkani': 'gom', 'Hindi': 'hi', 'Croatian': 'hr', 'Hungarian': 'hu',
+            'Indonesian': 'id', 'Ingush': 'inh', 'Icelandic': 'is', 'Italian': 'it', 'Japanese': 'ja',
+            'Kabardian': 'kbd', 'Kannada': 'kn', 'Korean': 'ko', 'Kurdish': 'ku', 'Latin': 'la',
+            'Lak': 'lbe', 'Lezghian': 'lez', 'Lithuanian': 'lt', 'Latvian': 'lv', 'Magahi': 'mah',
+            'Maithili': 'mai', 'Maori': 'mi', 'Mongolian': 'mn', 'Marathi': 'mr', 'Malay': 'ms',
+            'Maltese': 'mt', 'Nepali': 'ne', 'Newari': 'new', 'Dutch': 'nl', 'Norwegian': 'no',
+            'Occitan': 'oc', 'Pali': 'pi', 'Polish': 'pl', 'Portuguese': 'pt', 'Romanian': 'ro',
+            'Russian': 'ru', 'Serbian (cyrillic)': 'rs_cyrillic', 'Serbian (latin)': 'rs_latin',
+            'Nagpuri': 'sck', 'Slovak': 'sk', 'Slovenian': 'sl', 'Albanian': 'sq', 'Swedish': 'sv',
+            'Swahili': 'sw', 'Tamil': 'ta', 'Tabassaran': 'tab', 'Telugu': 'te', 'Thai': 'th',
+            'Tajik': 'tjk', 'Tagalog': 'tl', 'Turkish': 'tr', 'Uyghur': 'ug', 'Ukrainian': 'uk',
+            'Urdu': 'ur', 'Uzbek': 'uz', 'Vietnamese': 'vi'}
+
+        out_dict_lang_ppocr = {'Abaza': 'abq', 'Adyghe': 'ady', 'Afrikaans': 'af', 'Albanian': 'sq',
+            'Angika': 'ang', 'Arabic': 'ar', 'Avar': 'ava', 'Azerbaijani': 'az', 'Belarusian': 'be',
+            'Bhojpuri': 'bho', 'Bihari': 'bh', 'Bosnian': 'bs', 'Bulgarian': 'bg', 'Chinese & English': 'ch',
+            'Chinese Traditional': 'chinese_cht', 'Croatian': 'hr', 'Czech': 'cs', 'Danish': 'da',
+            'Dargwa': 'dar', 'Dutch': 'nl', 'English': 'en', 'Estonian': 'et', 'French': 'fr',
+            'German': 'german', 'Goan Konkani': 'gom', 'Hindi': 'hi', 'Hungarian': 'hu', 'Icelandic': 'is',
+            'Indonesian': 'id', 'Ingush': 'inh', 'Irish': 'ga', 'Italian': 'it', 'Japan': 'japan',
+            'Kabardian': 'kbd', 'Korean': 'korean', 'Kurdish': 'ku', 'Lak': 'lbe', 'Latvian': 'lv',
+            'Lezghian': 'lez', 'Lithuanian': 'lt', 'Magahi': 'mah', 'Maithili': 'mai', 'Malay': 'ms',
+            'Maltese': 'mt', 'Maori': 'mi', 'Marathi': 'mr', 'Mongolian': 'mn', 'Nagpur': 'sck',
+            'Nepali': 'ne', 'Newari': 'new', 'Norwegian': 'no', 'Occitan': 'oc', 'Persian': 'fa',
+            'Polish': 'pl', 'Portuguese': 'pt', 'Romanian': 'ro', 'Russia': 'ru', 'Saudi Arabia': 'sa',
+            'Serbian(cyrillic)': 'rs_cyrillic', 'Serbian(latin)': 'rs_latin', 'Slovak': 'sk',
+            'Slovenian': 'sl', 'Spanish': 'es', 'Swahili': 'sw', 'Swedish': 'sv', 'Tabassaran': 'tab',
+            'Tagalog': 'tl', 'Tamil': 'ta', 'Telugu': 'te', 'Turkish': 'tr', 'Ukrainian': 'uk',
+            'Urdu': 'ur', 'Uyghur': 'ug', 'Uzbek': 'uz', 'Vietnamese': 'vi', 'Welsh': 'cy'}
+
+        out_dict_lang_mmocr = {'English & Chinese': 'en'}
+
+        out_dict_lang_tesseract = {'Afrikaans': 'afr', 'Albanian': 'sqi', 'Amharic': 'amh',
+            'Arabic': 'ara', 'Armenian': 'hye', 'Assamese': 'asm', 'Azerbaijani - Cyrilic': 'aze_cyrl',
+            'Azerbaijani': 'aze', 'Basque': 'eus', 'Belarusian': 'bel', 'Bengali': 'ben', 'Bosnian': 'bos',
+            'Breton': 'bre', 'Bulgarian': 'bul', 'Burmese': 'mya', 'Catalan; Valencian': 'cat',
+            'Cebuano': 'ceb', 'Central Khmer': 'khm', 'Cherokee': 'chr', 'Chinese - Simplified': 'chi_sim',
+            'Chinese - Traditional': 'chi_tra', 'Corsican': 'cos', 'Croatian': 'hrv', 'Czech': 'ces',
+            'Danish': 'dan', 'Dutch; Flemish': 'nld', 'Dzongkha': 'dzo', 'English, Middle (1100-1500)': 'enm',
+            'English': 'eng', 'Esperanto': 'epo', 'Estonian': 'est', 'Faroese': 'fao',
+            'Filipino (old - Tagalog)': 'fil', 'Finnish': 'fin', 'French, Middle (ca.1400-1600)': 'frm',
+            'French': 'fra', 'Galician': 'glg', 'Georgian - Old': 'kat_old', 'Georgian': 'kat',
+            'German - Fraktur': 'frk', 'German': 'deu', 'Greek, Modern (1453-)': 'ell', 'Gujarati': 'guj',
+            'Haitian; Haitian Creole': 'hat', 'Hebrew': 'heb', 'Hindi': 'hin', 'Hungarian': 'hun',
+            'Icelandic': 'isl', 'Indonesian': 'ind', 'Inuktitut': 'iku', 'Irish': 'gle',
+            'Italian - Old': 'ita_old', 'Italian': 'ita', 'Japanese': 'jpn', 'Javanese': 'jav',
+            'Kannada': 'kan', 'Kazakh': 'kaz', 'Kirghiz; Kyrgyz': 'kir', 'Korean (vertical)': 'kor_vert',
+            'Korean': 'kor', 'Kurdish (Arabic Script)': 'kur_ara', 'Lao': 'lao', 'Latin': 'lat',
+            'Latvian': 'lav', 'Lithuanian': 'lit', 'Luxembourgish': 'ltz', 'Macedonian': 'mkd', 'Malay': 'msa',
+            'Malayalam': 'mal', 'Maltese': 'mlt', 'Maori': 'mri', 'Marathi': 'mar', 'Mongolian': 'mon',
+            'Nepali': 'nep', 'Norwegian': 'nor', 'Occitan (post 1500)': 'oci',
+            'Orientation and script detection module': 'osd', 'Oriya': 'ori', 'Panjabi; Punjabi': 'pan',
+            'Persian': 'fas', 'Polish': 'pol', 'Portuguese': 'por', 'Pushto; Pashto': 'pus', 'Quechua': 'que',
+            'Romanian; Moldavian; Moldovan': 'ron', 'Russian': 'rus', 'Sanskrit': 'san',
+            'Scottish Gaelic': 'gla', 'Serbian - Latin': 'srp_latn', 'Serbian': 'srp', 'Sindhi': 'snd',
+            'Sinhala; Sinhalese': 'sin', 'Slovak': 'slk', 'Slovenian': 'slv',
+            'Spanish; Castilian - Old': 'spa_old', 'Spanish; Castilian': 'spa', 'Sundanese': 'sun',
+            'Swahili': 'swa', 'Swedish': 'swe', 'Syriac': 'syr', 'Tajik': 'tgk', 'Tamil': 'tam',
+            'Tatar': 'tat', 'Telugu': 'tel', 'Thai': 'tha', 'Tibetan': 'bod', 'Tigrinya': 'tir', 'Tonga': 'ton',
+            'Turkish': 'tur', 'Uighur; Uyghur': 'uig', 'Ukrainian': 'ukr', 'Urdu': 'urd',
+            'Uzbek - Cyrilic': 'uzb_cyrl', 'Uzbek': 'uzb', 'Vietnamese': 'vie', 'Welsh': 'cym',
+            'Western Frisian': 'fry', 'Yiddish': 'yid', 'Yoruba': 'yor'}
+
+        out_list_dict_lang = [out_dict_lang_easyocr, out_dict_lang_ppocr, out_dict_lang_mmocr,
+                              out_dict_lang_tesseract]
+
+        # Initialization of the detection form
+        if 'columns_size' not in st.session_state:
+            st.session_state.columns_size = [2] + [1 for x in out_reader_type_list[1:]]
+        if 'column_width' not in st.session_state:
+            st.session_state.column_width = [400] + [300 for x in out_reader_type_list[1:]]
+        if 'columns_color' not in st.session_state:
+            st.session_state.columns_color = ["rgb(228,26,28)"] + \
+                                             ["rgb(0,0,0)" for x in out_reader_type_list[1:]]
+        if 'list_coordinates' not in st.session_state:
+            st.session_state.list_coordinates = []
+
+        # Confidence color scale
+        out_list_confid = list(np.arange(0,101,1))
+        out_list_grad = mcp.gen_color_normalized(cmap="Greens", data_arr=np.array(out_list_confid))
+        out_dict_back_colors = {out_list_confid[i]: out_list_grad[i]
+                                for i in range(len(out_list_confid))}
+
+        list_y = [1 for i in out_list_confid]
+        df_confid = pd.DataFrame({'% confidence scale': out_list_confid, 'y': list_y})
+
+        out_fig = px.scatter(df_confid, x='% confidence scale', y='y',
+                             hover_data={'% confidence scale': True, 'y': False},
+                             color=out_dict_back_colors.values(), range_y=[0.9,1.1], range_x=[0,100],
+                             color_discrete_map="identity", height=50, symbol='y', symbol_sequence=['square'])
+        out_fig.update_xaxes(showticklabels=False)
+        out_fig.update_yaxes(showticklabels=False, range=[0.1, 1.1], visible=False)
+        out_fig.update_traces(marker_size=50)
+        out_fig.update_layout(paper_bgcolor="white", margin=dict(b=0,r=0,t=0,l=0), xaxis_side="top",
+                              showlegend=False)
+
+        return out_reader_type_list, out_reader_type_dict, out_list_dict_lang, \
+               out_cols_size, out_dict_back_colors, out_fig
+
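The confidence color scale built above maps each integer confidence 0–100 to a green shade; a minimal standalone sketch of the same mapping (`gen_color_normalized` is the same mycolorpy call used in `initializations`):

```python
import numpy as np
from mycolorpy import colorlist as mcp

confidences = list(np.arange(0, 101, 1))
greens = mcp.gen_color_normalized(cmap="Greens", data_arr=np.array(confidences))
back_colors = dict(zip(confidences, greens))
print(back_colors[100])   # darkest green, used behind the highest-confidence text
```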
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def init_easyocr(in_params):
+        """Initialization of the EasyOCR reader
+
+        Args:
+            in_params (list): list with the language
+
+        Returns:
+            easyocr reader: the easyocr reader instance
+        """
+        out_ocr = easyocr.Reader(in_params)
+        return out_ocr
+
+    ###
+    @st.cache(show_spinner=False)
+    def init_ppocr(in_params):
+        """Initialization of the PPOCR reader
+
+        Args:
+            in_params (list): language and dict of parameters
+
+        Returns:
+            ppocr reader: the ppocr reader instance
+        """
+        out_ocr = PaddleOCR(lang=in_params[0], **in_params[1])
+        return out_ocr
+
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def init_mmocr(in_params):
+        """Initialization of the MMOCR reader
+
+        Args:
+            in_params (list): language and dict of parameters
+
+        Returns:
+            mmocr reader: the mmocr reader instance
+        """
+        out_ocr = MMOCR(recog=None, **in_params[1])
+        return out_ocr
+
+    ###
+    def init_readers(in_list_params):
+        """Initialization of the readers, returned as a list
+
+        Args:
+            in_list_params (list): list of dicts of parameters for each reader
+
+        Returns:
+            list: list of the reader instances
+        """
+        # Instantiations of the readers:
+        # - EasyOCR
+        with st.spinner("EasyOCR reader initialization in progress ..."):
+            reader_easyocr = init_easyocr([in_list_params[0][0]])
+
+        # - PPOCR
+        with st.spinner("PPOCR reader initialization in progress ..."):
+            reader_ppocr = init_ppocr(in_list_params[1])
+
+        # - MMOCR
+        with st.spinner("MMOCR reader initialization in progress ..."):
+            reader_mmocr = init_mmocr(in_list_params[2])
+
+        out_list_readers = [reader_easyocr, reader_ppocr, reader_mmocr]
+
+        return out_list_readers
+
+    ###
+    def load_image(in_image_file):
+        """Load the input file and open it
+
+        Args:
+            in_image_file (string or Streamlit UploadedFile): image to consider
+
+        Returns:
+            string    : locally saved image path
+            PIL.Image : input file opened with Pillow
+            matrix    : input file opened with OpenCV
+        """
+
+        if isinstance(in_image_file, str):
+            out_image_path = "tmp_"+in_image_file
+        else:
+            out_image_path = "tmp_"+in_image_file.name
+
+        img = Image.open(in_image_file)
+        img.save(out_image_path)
+
+        # Read image
+        out_image_orig = Image.open(out_image_path)
+        out_image_cv2 = cv2.cvtColor(cv2.imread(out_image_path), cv2.COLOR_BGR2RGB)
+
+        return out_image_path, out_image_orig, out_image_cv2
+
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def easyocr_detect(_in_reader, in_image_path, in_params):
+        """Detection with EasyOCR
+
+        Args:
+            _in_reader (EasyOCR reader): the previously initialized instance
+            in_image_path (string)     : locally saved image path
+            in_params (list)           : list with the parameters for detection
+
+        Returns:
+            list : list of the boxes coordinates
+            exception on error, string 'OK' otherwise
+        """
+        try:
+            dict_param = in_params[1]
+            detection_result = _in_reader.detect(in_image_path,
+                                                 #width_ths=0.7,
+                                                 #mag_ratio=1.5
+                                                 **dict_param
+                                                )
+            easyocr_coordinates = detection_result[0][0]
+
+            # The format of the coordinates is as follows: [x_min, x_max, y_min, y_max]
+            # Format the boxes coordinates for drawing
+            out_easyocr_boxes_coordinates = list(map(easyocr_coord_convert, easyocr_coordinates))
+            out_status = 'OK'
+        except Exception as e:
+            out_easyocr_boxes_coordinates = []
+            out_status = e
+
+        return out_easyocr_boxes_coordinates, out_status
+
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def ppocr_detect(_in_reader, in_image_path):
+        """Detection with PPOCR
+
+        Args:
+            _in_reader (PPOCR reader): the previously initialized instance
+            in_image_path (string)   : locally saved image path
+
+        Returns:
+            list : list of the boxes coordinates
+            exception on error, string 'OK' otherwise
+        """
+        # PPOCR detection method
+        try:
+            out_ppocr_boxes_coordinates = _in_reader.ocr(in_image_path, rec=False)
+            out_status = 'OK'
+        except Exception as e:
+            out_ppocr_boxes_coordinates = []
+            out_status = e
+
+        return out_ppocr_boxes_coordinates, out_status
+
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def mmocr_detect(_in_reader, in_image_path):
+        """Detection with MMOCR
+
+        Args:
+            _in_reader (MMOCR reader): the previously initialized instance
+            in_image_path (string)   : locally saved image path
+
+        Returns:
+            list : list of the boxes coordinates
+            exception on error, string 'OK' otherwise
+        """
+        # MMOCR detection method
+        out_mmocr_boxes_coordinates = []
+        try:
+            det_result = _in_reader.readtext(in_image_path, details=True)
+            bboxes_list = [res['boundary_result'] for res in det_result]
+            for bboxes in bboxes_list:
+                for bbox in bboxes:
+                    if len(bbox) > 9:
+                        min_x = min(bbox[0:-1:2])
+                        min_y = min(bbox[1:-1:2])
+                        max_x = max(bbox[0:-1:2])
+                        max_y = max(bbox[1:-1:2])
+                        #box = [min_x, min_y, max_x, min_y, max_x, max_y, min_x, max_y]
+                    else:
+                        min_x = min(bbox[0:-1:2])
+                        min_y = min(bbox[1::2])
+                        max_x = max(bbox[0:-1:2])
+                        max_y = max(bbox[1::2])
+                    box4 = [ [min_x, min_y], [max_x, min_y], [max_x, max_y], [min_x, max_y] ]
+                    out_mmocr_boxes_coordinates.append(box4)
+            out_status = 'OK'
+        except Exception as e:
+            out_status = e
+
+        return out_mmocr_boxes_coordinates, out_status
+
+    ###
+    def cropped_1box(in_box, in_img):
+        """Construct a cropped image corresponding to an area of the initial image
+
+        Args:
+            in_box (list)   : box with coordinates
+            in_img (matrix) : image
+
+        Returns:
+            matrix : cropped image
+        """
+        box_ar = np.array(in_box).astype(np.int64)
+        x_min = box_ar[:, 0].min()
+        x_max = box_ar[:, 0].max()
+        y_min = box_ar[:, 1].min()
+        y_max = box_ar[:, 1].max()
+        out_cropped = in_img[y_min:y_max, x_min:x_max]
+
+        return out_cropped
+
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def tesserocr_detect(in_image_path, _in_img, in_params):
+        """Detection with Tesseract
+
+        Args:
+            in_image_path (string) : locally saved image path
+            _in_img (PIL.Image)    : image to consider
+            in_params (list)       : list with the parameters for detection
+
+        Returns:
+            list : list of the boxes coordinates
+            exception on error, string 'OK' otherwise
+        """
+        try:
+            dict_param = in_params[1]
+            df_res = pytesseract.image_to_data(_in_img, **dict_param, output_type=Output.DATAFRAME)
+
+            df_res['box'] = df_res.apply(lambda d: [[d['left'], d['top']],
+                                                    [d['left'] + d['width'], d['top']],
+                                                    [d['left'] + d['width'], d['top'] + d['height']],
+                                                    [d['left'], d['top'] + d['height']],
+                                                   ], axis=1)
+            out_tesserocr_boxes_coordinates = df_res[df_res.word_num > 0]['box'].to_list()
+            out_status = 'OK'
+        except Exception as e:
+            out_tesserocr_boxes_coordinates = []
+            out_status = e
+
+        return out_tesserocr_boxes_coordinates, out_status
+
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def process_detect(in_image_path, _in_list_images, _in_list_readers, in_list_params, in_color):
+        """Detection process for each OCR solution
+
+        Args:
+            in_image_path (string)  : locally saved image path
+            _in_list_images (list)  : list of original images
+            _in_list_readers (list) : list with previously initialized reader instances
+            in_list_params (list)   : list with dict parameters for each OCR solution
+            in_color (tuple)        : color for boxes around text
+
+        Returns:
+            list: list of detection results images
+            list: list of boxes coordinates
+        """
+        ## ------- EasyOCR Text detection
+        with st.spinner('EasyOCR Text detection in progress ...'):
+            easyocr_boxes_coordinates, easyocr_status = easyocr_detect(_in_list_readers[0], \
+                                                            in_image_path, in_list_params[0])
+            # Visualization
+            if easyocr_boxes_coordinates:
+                easyocr_image_detect = draw_detected(_in_list_images[0], easyocr_boxes_coordinates, \
+                                                     in_color, 'None', 3)
+            else:
+                # On error, show the exception in place of an image
+                easyocr_image_detect = easyocr_status
+        ##
+
+        ## ------- PPOCR Text detection
+        with st.spinner('PPOCR Text detection in progress ...'):
+            ppocr_boxes_coordinates, ppocr_status = ppocr_detect(_in_list_readers[1], in_image_path)
+            # Visualization
+            if ppocr_boxes_coordinates:
+                ppocr_image_detect = draw_detected(_in_list_images[0], ppocr_boxes_coordinates, \
+                                                   in_color, 'None', 3)
+            else:
+                ppocr_image_detect = ppocr_status
+        ##
+
+        ## ------- MMOCR Text detection
+        with st.spinner('MMOCR Text detection in progress ...'):
+            mmocr_boxes_coordinates, mmocr_status = mmocr_detect(_in_list_readers[2], in_image_path)
+            # Visualization
+            if mmocr_boxes_coordinates:
+                mmocr_image_detect = draw_detected(_in_list_images[0], mmocr_boxes_coordinates, \
+                                                   in_color, 'None', 3)
+            else:
+                mmocr_image_detect = mmocr_status
+        ##
+
+        ## ------- Tesseract Text detection
+        with st.spinner('Tesseract Text detection in progress ...'):
+            tesserocr_boxes_coordinates, tesserocr_status = tesserocr_detect(in_image_path, \
+                                                                _in_list_images[0], \
+                                                                in_list_params[3])
+            # Visualization
+            if tesserocr_status == 'OK':
+                tesserocr_image_detect = draw_detected(_in_list_images[0], tesserocr_boxes_coordinates, \
+                                                       in_color, 'None', 3)
+            else:
+                tesserocr_image_detect = tesserocr_status
+        ##
+
+        out_list_images = _in_list_images + [easyocr_image_detect, ppocr_image_detect, \
+                                             mmocr_image_detect, tesserocr_image_detect]
+        out_list_coordinates = [easyocr_boxes_coordinates, ppocr_boxes_coordinates, \
+                                mmocr_boxes_coordinates, tesserocr_boxes_coordinates]
+
+        return out_list_images, out_list_coordinates
+
+    ###
+    def draw_detected(in_image, in_boxes_coordinates, in_color, posit='None', in_thickness=4):
+        """Draw boxes around detected text
+
+        Args:
+            in_image (PIL.Image)        : original image
+            in_boxes_coordinates (list) : boxes coordinates, from top to bottom and from left to right
+                                          [ [ [x_min, y_min], [x_max, y_min], [x_max, y_max], [x_min, y_max] ],
+                                            [ ... ]
+                                          ]
+            in_color (tuple)            : color for boxes around text
+            posit (str, optional)       : position for text. Defaults to 'None'.
+            in_thickness (int, optional): thickness of the box. Defaults to 4.
+
+        Returns:
+            PIL.Image : original image with detected areas
+        """
+        work_img = in_image.copy()
+        if in_boxes_coordinates:
+            font = cv2.FONT_HERSHEY_SIMPLEX
+            for ind_box, box in enumerate(in_boxes_coordinates):
+                box = np.reshape(np.array(box), [-1, 1, 2]).astype(np.int64)
+                work_img = cv2.polylines(np.array(work_img), [box], True, in_color, in_thickness)
+                if posit != 'None':
+                    if posit == 'top_left':
+                        pos = tuple(box[0][0])
+                    elif posit == 'top_right':
+                        pos = tuple(box[1][0])
+                    # Use the in_color argument (the original referenced the outer 'color' variable)
+                    work_img = cv2.putText(work_img, str(ind_box+1), pos, font, 5.5, in_color, \
+                                           in_thickness, cv2.LINE_AA)
+
+            out_image_drawn = Image.fromarray(work_img)
+        else:
+            out_image_drawn = work_img
+
+        return out_image_drawn
+
+    ###
+    @st.experimental_memo(show_spinner=False)
+    def get_cropped(in_boxes_coordinates, in_image_cv):
+        """Construct the list of cropped images corresponding to the input boxes coordinates list
+
+        Args:
+            in_boxes_coordinates (list) : list of boxes coordinates
+            in_image_cv (matrix)        : original image
+
+        Returns:
+            list : list with cropped images
+        """
+        out_list_images = []
+        for box in in_boxes_coordinates:
+            cropped = cropped_1box(box, in_image_cv)
+            out_list_images.append(cropped)
+        return out_list_images
+
+    ###
+    def process_recog(in_list_readers, in_image_cv, in_boxes_coordinates, in_list_dict_params):
+        """Recognition process for each OCR solution
+
+        Args:
+            in_list_readers (list)      : list with previously initialized reader instances
+            in_image_cv (matrix)        : original image
+            in_boxes_coordinates (list) : list of boxes coordinates
+            in_list_dict_params (list)  : list with dict parameters for each OCR solution
+
+        Returns:
+            data frame : results for each OCR solution, except Tesseract
+            data frame : results for Tesseract
+            list       : status for each recognition (exception or 'OK')
+        """
+        out_df_results = pd.DataFrame([])
+
+        list_text_easyocr = []
+        list_confidence_easyocr = []
+        list_text_ppocr = []
+        list_confidence_ppocr = []
+        list_text_mmocr = []
+        list_confidence_mmocr = []
+
+        # Create cropped images from detection
+        list_cropped_images = get_cropped(in_boxes_coordinates, in_image_cv)
+
+        # Recognize with EasyOCR
+        with st.spinner('EasyOCR Text recognition in progress ...'):
+            list_text_easyocr, list_confidence_easyocr, status_easyocr = \
+                easyocr_recog(list_cropped_images, in_list_readers[0], in_list_dict_params[0])
+        ##
+
+        # Recognize with PPOCR
+        with st.spinner('PPOCR Text recognition in progress ...'):
+            list_text_ppocr, list_confidence_ppocr, status_ppocr = \
+                ppocr_recog(list_cropped_images, in_list_dict_params[1])
+        ##
+
+        # Recognize with MMOCR
+        with st.spinner('MMOCR Text recognition in progress ...'):
+            list_text_mmocr, list_confidence_mmocr, status_mmocr = \
+                mmocr_recog(list_cropped_images, in_list_dict_params[2])
+        ##
+
+        # Recognize with Tesseract
+        with st.spinner('Tesseract Text recognition in progress ...'):
+            out_df_results_tesseract, status_tesseract = \
+                tesserocr_recog(in_image_cv, in_list_dict_params[3], len(list_cropped_images))
+        ##
+
+        # Create the results data frame
+        out_df_results = pd.DataFrame({'cropped_image': list_cropped_images,
+                                       'text_easyocr': list_text_easyocr,
+                                       'confidence_easyocr': list_confidence_easyocr,
+                                       'text_ppocr': list_text_ppocr,
+                                       'confidence_ppocr': list_confidence_ppocr,
+                                       'text_mmocr': list_text_mmocr,
+                                       'confidence_mmocr': list_confidence_mmocr
+                                      }
+                                     )
+
+        out_list_reco_status = [status_easyocr, status_ppocr, status_mmocr, status_tesseract]
+
+        return out_df_results, out_df_results_tesseract, out_list_reco_status
+
+    ###
+    @st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+    def easyocr_recog(in_list_images, _in_reader_easyocr, in_params):
+        """Recognition with EasyOCR
+
+        Args:
+            in_list_images (list)               : list of cropped images
+            _in_reader_easyocr (EasyOCR reader) : the previously initialized instance
+            in_params (dict)                    : parameters for recognition
+
+        Returns:
+            list : list of recognized text
+            list : list of recognition confidence
+            string/Exception : recognition status
+        """
+        progress_bar = st.progress(0)
+        out_list_text_easyocr = []
+        out_list_confidence_easyocr = []
+        ## ------- EasyOCR Text recognition
+        try:
+            step = 0*len(in_list_images)  # first recognition process
+            nb_steps = 4 * len(in_list_images)
+            for ind_img, cropped in enumerate(in_list_images):
+                result = _in_reader_easyocr.recognize(cropped, **in_params)
+                try:
+                    out_list_text_easyocr.append(result[0][1])
+                    out_list_confidence_easyocr.append(np.round(100*result[0][2], 1))
+                except Exception:
+                    out_list_text_easyocr.append('Not recognized')
+                    out_list_confidence_easyocr.append(100.)
+                progress_bar.progress((step+ind_img+1)/nb_steps)
+            out_status = 'OK'
+        except Exception as e:
+            out_status = e
+        progress_bar.empty()
+
+        return out_list_text_easyocr, out_list_confidence_easyocr, out_status
+
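The four recognizers share a single progress bar, each owning a quarter of it: `step = k*N` offsets recognizer k's contribution out of `nb_steps = 4*N`. A small worked example, assuming N = 10 cropped images:

```python
N = 10                                   # number of cropped images
nb_steps = 4 * N                         # total budget across the 4 recognizers
step = 1 * N                             # PPOCR is the second recognizer (k = 1)
ind_img = 4                              # fifth image being recognized
print((step + ind_img + 1) / nb_steps)   # 0.375 -> the bar sits at 37.5%
```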
+    ###
+    @st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+    def ppocr_recog(in_list_images, in_params):
+        """Recognition with PPOCR
+
+        Args:
+            in_list_images (list) : list of cropped images
+            in_params (dict)      : parameters for recognition
+
+        Returns:
+            list : list of recognized text
+            list : list of recognition confidence
+            string/Exception : recognition status
+        """
+        ## ------- PPOCR Text recognition
+        out_list_text_ppocr = []
+        out_list_confidence_ppocr = []
+        step = 1*len(in_list_images)  # second recognition process
+        nb_steps = 4 * len(in_list_images)
+        # Create the progress bar before the try, so the except path can still clear it
+        progress_bar = st.progress(step/nb_steps)
+        try:
+            reader_ppocr = PaddleOCR(**in_params)
+
+            for ind_img, cropped in enumerate(in_list_images):
+                result = reader_ppocr.ocr(cropped, det=False, cls=False)
+                try:
+                    out_list_text_ppocr.append(result[0][0])
+                    out_list_confidence_ppocr.append(np.round(100*result[0][1], 1))
+                except Exception:
+                    out_list_text_ppocr.append('Not recognized')
+                    out_list_confidence_ppocr.append(100.)
+                progress_bar.progress((step+ind_img+1)/nb_steps)
+            out_status = 'OK'
+        except Exception as e:
+            out_status = e
+        progress_bar.empty()
+
+        return out_list_text_ppocr, out_list_confidence_ppocr, out_status
+
+    ###
+    @st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+    def mmocr_recog(in_list_images, in_params):
+        """Recognition with MMOCR
+
+        Args:
+            in_list_images (list) : list of cropped images
+            in_params (dict)      : parameters for recognition
+
+        Returns:
+            list : list of recognized text
+            list : list of recognition confidence
+            string/Exception : recognition status
+        """
+        ## ------- MMOCR Text recognition
+        out_list_text_mmocr = []
+        out_list_confidence_mmocr = []
+        step = 2*len(in_list_images)  # third recognition process
+        nb_steps = 4 * len(in_list_images)
+        # Create the progress bar before the try, so the except path can still clear it
+        progress_bar = st.progress(step/nb_steps)
+        try:
+            reader_mmocr = MMOCR(det=None, **in_params)
+
+            for ind_img, cropped in enumerate(in_list_images):
+                result = reader_mmocr.readtext(cropped, details=True)
+                try:
+                    out_list_text_mmocr.append(result[0]['text'])
+                    out_list_confidence_mmocr.append(np.round(100* \
+                                                     (np.array(result[0]['score']).mean()), 1))
+                except Exception:
+                    out_list_text_mmocr.append('Not recognized')
+                    out_list_confidence_mmocr.append(100.)
+                progress_bar.progress((step+ind_img+1)/nb_steps)
+            out_status = 'OK'
+        except Exception as e:
+            out_status = e
+        progress_bar.empty()
+
+        return out_list_text_mmocr, out_list_confidence_mmocr, out_status
+
+    ###
+    @st.experimental_memo(suppress_st_warning=True, show_spinner=False)
+    def tesserocr_recog(in_img, in_params, in_nb_images):
+        """Recognition with Tesseract
+
+        Args:
+            in_img (matrix)  : original image
+            in_params (dict) : parameters for recognition
+            in_nb_images     : number of cropped images (used for the progress bar)
+
+        Returns:
+            Pandas data frame : recognition results
+            string/Exception  : recognition status
+        """
+        ## ------- Tesseract Text recognition
+        step = 3*in_nb_images  # fourth recognition process
+        nb_steps = 4 * in_nb_images
+        progress_bar = st.progress(step/nb_steps)
+
+        try:
+            out_df_result = pytesseract.image_to_data(in_img, **in_params, output_type=Output.DATAFRAME)
+
+            out_df_result['box'] = out_df_result.apply(lambda d: [[d['left'], d['top']],
+                                                       [d['left'] + d['width'], d['top']],
+                                                       [d['left']+d['width'], d['top']+d['height']],
+                                                       [d['left'], d['top'] + d['height']],
+                                                      ], axis=1)
+            out_df_result['cropped'] = out_df_result['box'].apply(lambda b: cropped_1box(b, in_img))
+            out_df_result = out_df_result[(out_df_result.word_num > 0) & (out_df_result.text != ' ')] \
+                                .reset_index(drop=True)
+            out_status = 'OK'
+        except Exception as e:
+            out_df_result = pd.DataFrame([])
+            out_status = e
+
+        progress_bar.progress(1.)
+
+        return out_df_result, out_status
+
+    ###
+    def draw_reco_images(in_image, in_boxes_coordinates, in_list_texts, in_list_confid, \
+                         in_dict_back_colors, in_df_results_tesseract, in_reader_type_list, \
+                         in_font_scale=1, in_conf_threshold=65):
+        """Draw recognized text on the original image, for each OCR solution used
+
+        Args:
+            in_image (matrix)            : original image
+            in_boxes_coordinates (list)  : list of boxes coordinates
+            in_list_texts (list)         : list of recognized text for each recognizer (except Tesseract)
+            in_list_confid (list)        : list of recognition confidence for each recognizer (except Tesseract)
+            in_dict_back_colors (dict)   : confidence color scale
+            in_df_results_tesseract (Pandas data frame): Tesseract recognition results
+            in_reader_type_list (list)   : list of OCR solutions names
+            in_font_scale (int, optional): text font scale. Defaults to 1.
+            in_conf_threshold (int, optional): confidence threshold for the text color. Defaults to 65.
+
+        Returns:
+            shows the results container
+        """
+        img = in_image.copy()
+        nb_readers = len(in_reader_type_list)
+        list_reco_images = [img.copy() for i in range(nb_readers)]
+
+        for num, box_ in enumerate(in_boxes_coordinates):
+            box = np.array(box_).astype(np.int64)
+
+            # For each box: draw the results of each recognizer
+            for ind_r in range(nb_readers-1):
+                confid = np.round(in_list_confid[ind_r][num], 0)
+                rgb_color = ImageColor.getcolor(in_dict_back_colors[confid], "RGB")
+                if confid < in_conf_threshold:
+                    text_color = (0, 0, 0)
+                else:
+                    text_color = (255, 255, 255)
+
+                list_reco_images[ind_r] = cv2.rectangle(list_reco_images[ind_r], \
+                                                        (box[0][0], box[0][1]), \
+                                                        (box[2][0], box[2][1]), rgb_color, -1)
+                list_reco_images[ind_r] = cv2.putText(list_reco_images[ind_r], \
+                                                      in_list_texts[ind_r][num], \
+                                                      (box[0][0], int(np.round((box[0][1]+box[2][1])/2, 0))), \
+                                                      cv2.FONT_HERSHEY_DUPLEX, in_font_scale, text_color, 2)
+
+        # Add the Tesseract results
+        if not in_df_results_tesseract.empty:
+            ind_tessocr = nb_readers-1
+            for num, box_ in enumerate(in_df_results_tesseract['box'].to_list()):
+                box = np.array(box_).astype(np.int64)
+                confid = np.round(in_df_results_tesseract.iloc[num]['conf'], 0)
+                rgb_color = ImageColor.getcolor(in_dict_back_colors[confid], "RGB")
+                if confid < in_conf_threshold:
+                    text_color = (0, 0, 0)
+                else:
+                    text_color = (255, 255, 255)
+
+                list_reco_images[ind_tessocr] = \
+                    cv2.rectangle(list_reco_images[ind_tessocr], (box[0][0], box[0][1]), \
+                                  (box[2][0], box[2][1]), rgb_color, -1)
+                try:
+                    list_reco_images[ind_tessocr] = \
+                        cv2.putText(list_reco_images[ind_tessocr], \
+                                    in_df_results_tesseract.iloc[num]['text'], \
+                                    (box[0][0], int(np.round((box[0][1]+box[2][1])/2, 0))), \
+                                    cv2.FONT_HERSHEY_DUPLEX, in_font_scale, text_color, 2)
+                except Exception:
+                    pass
+
+        with show_reco.container():
+            # Draw the results, 2 images per line
+            reco_lines = math.ceil(len(in_reader_type_list) / 2)
+            column_width = 400
+            for ind_lig in range(0, reco_lines+1, 2):
+                cols = st.columns(2)
+                for ind_col in range(2):
+                    ind = ind_lig + ind_col
+                    if ind < len(in_reader_type_list):  # '<' avoids an off-by-one index error
+                        if in_reader_type_list[ind] == 'Tesseract':
+                            column_title = '<p style="font-size: 20px;color:rgb(0,0,0); \
+                                            ">Recognition with ' + in_reader_type_list[ind] + \
+                                            '<sp style="font-size: 17px"> (with its own detector) \
+                                            </sp></p>'
+                        else:
+                            column_title = '<p style="font-size: 20px;color:rgb(0,0,0); \
+                                            ">Recognition with ' + \
+                                            in_reader_type_list[ind] + '</p>'
+                        cols[ind_col].markdown(column_title, unsafe_allow_html=True)
+                        if st.session_state.list_reco_status[ind] == 'OK':
+                            cols[ind_col].image(list_reco_images[ind], \
+                                                width=column_width, use_column_width=True)
+                        else:
+                            cols[ind_col].write(st.session_state.list_reco_status[ind], \
+                                                use_column_width=True)
+
+        st.markdown(' 💡 Bad font size? You can adjust it below and refresh:')
+
+    ###
+    def highlight():
+        """Highlight the chosen detector results
+        """
+        with show_detect.container():
+            columns_size = [1 for x in reader_type_list]
+            column_width = [300 for x in reader_type_list]
+            columns_color = ["rgb(0,0,0)" for x in reader_type_list]
+            columns_size[reader_type_dict[st.session_state.detect_reader]] = 2
+            column_width[reader_type_dict[st.session_state.detect_reader]] = 400
+            columns_color[reader_type_dict[st.session_state.detect_reader]] = "rgb(228,26,28)"
+            columns = st.columns(columns_size, )  #gap='medium')
+
+            for ind_col, col in enumerate(columns):
+                column_title = '<p style="font-size: 20px;color:'+columns_color[ind_col] + \
+                               ';">Detection with ' + reader_type_list[ind_col] + '</p>'
+                col.markdown(column_title, unsafe_allow_html=True)
+                if isinstance(list_images[ind_col+2], PIL.Image.Image):
+                    col.image(list_images[ind_col+2], width=column_width[ind_col], \
+                              use_column_width=True)
+                else:
+                    col.write(list_images[ind_col+2], use_column_width=True)
+            st.session_state.columns_size = columns_size
+            st.session_state.column_width = column_width
+            st.session_state.columns_color = columns_color
+
+    ###
+    @st.cache(show_spinner=False)
+    def get_demo():
+        """Get the demo files
+
+        Returns:
+            PIL.Image : first demo file opened with Pillow
+            PIL.Image : second demo file opened with Pillow
+        """
+
+        out_img_demo_1 = Image.open("img_demo_1.jpg")
+        out_img_demo_2 = Image.open("img_demo_2.jpg")
+
+        return out_img_demo_1, out_img_demo_2
+
+    ###
+    def raz():
+        """Reset the app state and clear the cached results ('raz' = remise à zéro, i.e. reset)"""
+        st.session_state.list_coordinates = []
+        st.session_state.list_images = []
+        st.session_state.detect_reader = reader_type_list[0]
+
+        st.session_state.columns_size = [2] + [1 for x in reader_type_list[1:]]
+        st.session_state.column_width = [400] + [300 for x in reader_type_list[1:]]
+        st.session_state.columns_color = ["rgb(228,26,28)"] + \
+                                         ["rgb(0,0,0)" for x in reader_type_list[1:]]
+
+        # Clear caches
+        easyocr_detect.clear()
+        ppocr_detect.clear()
+        mmocr_detect.clear()
+        tesserocr_detect.clear()
+        process_detect.clear()
+        get_cropped.clear()
+        easyocr_recog.clear()
+        ppocr_recog.clear()
+        mmocr_recog.clear()
+        tesserocr_recog.clear()
+
+
+##----------- Initializations ---------------------------------------------------------------------
+    #print("PID : ", os.getpid())
+
+    st.title("OCR solutions comparator")
+    st.markdown("##### *EasyOCR, PPOCR, MMOCR, Tesseract*")
+    #st.markdown("#### PID : " + str(os.getpid()))
+
+    # Initializations
+    with st.spinner("Initializations in progress ..."):
+        reader_type_list, reader_type_dict, list_dict_lang, \
+        cols_size, dict_back_colors, fig_colorscale = initializations()
+        img_demo_1, img_demo_2 = get_demo()
+
+##----------- Choose language & image -------------------------------------------------------------
+    st.markdown("#### Choose languages for the text recognition:")
+    lang_col = st.columns(4)
+    easyocr_key_lang = lang_col[0].selectbox(reader_type_list[0]+" :", list_dict_lang[0].keys(), 26)
+    easyocr_lang = list_dict_lang[0][easyocr_key_lang]
+    ppocr_key_lang = lang_col[1].selectbox(reader_type_list[1]+" :", list_dict_lang[1].keys(), 22)
+    ppocr_lang = list_dict_lang[1][ppocr_key_lang]
+    mmocr_key_lang = lang_col[2].selectbox(reader_type_list[2]+" :", list_dict_lang[2].keys(), 0)
+    mmocr_lang = list_dict_lang[2][mmocr_key_lang]
+    tesserocr_key_lang = lang_col[3].selectbox(reader_type_list[3]+" :", list_dict_lang[3].keys(), 35)
+    tesserocr_lang = list_dict_lang[3][tesserocr_key_lang]
+
+    st.markdown("#### Choose picture:")
+    cols_pict = st.columns([1, 2])
+    img_typ = cols_pict[0].radio("", ['Upload file', 'Take a picture', 'Use a demo file'], \
+                                 index=0, on_change=raz)
+
+    if img_typ == 'Upload file':
+        image_file = cols_pict[1].file_uploader("Upload a file:", type=["jpg","jpeg"], on_change=raz)
+    if img_typ == 'Take a picture':
+        image_file = cols_pict[1].camera_input("Take a picture:", on_change=raz)
+    if img_typ == 'Use a demo file':
+        with st.expander('Choose a demo file:', expanded=True):
+            demo_used = st.radio('', ['File 1', 'File 2'], index=0, \
+                                 horizontal=True, on_change=raz)
+            cols_demo = st.columns([1, 2])
+            cols_demo[0].markdown('###### File 1')
+            cols_demo[0].image(img_demo_1, width=150)
+            cols_demo[1].markdown('###### File 2')
+            cols_demo[1].image(img_demo_2, width=300)
+            if demo_used == 'File 1':
+                image_file = 'img_demo_1.jpg'
+            else:
+                image_file = 'img_demo_2.jpg'
+
+##----------- Process input image -----------------------------------------------------------------
+    if image_file is not None:
+        image_path, image_orig, image_cv2 = load_image(image_file)
+        list_images = [image_orig, image_cv2]
+
997
+ ##----------- Form with original image & hyperparameters for detectors ----------------------------
998
+ with st.form("form1"):
999
+ col1, col2 = st.columns(2, ) #gap="medium")
1000
+ col1.markdown("##### Original image")
1001
+ col1.image(list_images[0], width=400)
1002
+ col2.markdown("##### Hyperparameters values for detection")
1003
+
1004
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[0], \
1005
+ expanded=False):
1006
+ t0_min_size = st.slider("min_size", 1, 20, 10, step=1, \
1007
+ help="min_size (int, default = 10) - Filter text box smaller than \
1008
+ minimum value in pixel")
1009
+ t0_text_threshold = st.slider("text_threshold", 0.1, 1., 0.7, step=0.1, \
1010
+ help="text_threshold (float, default = 0.7) - Text confidence threshold")
1011
+ t0_low_text = st.slider("low_text", 0.1, 1., 0.4, step=0.1, \
1012
+ help="low_text (float, default = 0.4) - Text low-bound score")
1013
+ t0_link_threshold = st.slider("link_threshold", 0.1, 1., 0.4, step=0.1, \
1014
+ help="link_threshold (float, default = 0.4) - Link confidence threshold")
1015
+ t0_canvas_size = st.slider("canvas_size", 2000, 5000, 2560, step=10, \
1016
+ help='''canvas_size (int, default = 2560) \n
1017
+ Maximum e size. Image bigger than this value will be resized down''')
1018
+ t0_mag_ratio = st.slider("mag_ratio", 0.1, 5., 1., step=0.1, \
1019
+ help="mag_ratio (float, default = 1) - Image magnification ratio")
1020
+ t0_slope_ths = st.slider("slope_ths", 0.01, 1., 0.1, step=0.01, \
1021
+ help='''slope_ths (float, default = 0.1) - Maximum slope \
1022
+ (delta y/delta x) to considered merging. \n
1023
+ Low valuans tiled boxes will not be merged.''')
1024
+ t0_ycenter_ths = st.slider("ycenter_ths", 0.1, 1., 0.5, step=0.1, \
1025
+ help='''ycenter_ths (float, default = 0.5) - Maximum shift in y direction. \n
1026
+ Boxes wiifferent level should not be merged.''')
1027
+ t0_height_ths = st.slider("height_ths", 0.1, 1., 0.5, step=0.1, \
1028
+ help='''height_ths (float, default = 0.5) - Maximum different in box height. \n
1029
+ Boxes wiery different text size should not be merged.''')
1030
+ t0_width_ths = st.slider("width_ths", 0.1, 1., 0.5, step=0.1, \
1031
+ help="width_ths (float, default = 0.5) - Maximum horizontal \
1032
+ distance to merge boxes.")
1033
+ t0_add_margin = st.slider("add_margin", 0.1, 1., 0.1, step=0.1, \
1034
+ help='''add_margin (float, default = 0.1) - \
1035
+ Extend bounding boxes in all direction by certain value. \n
1036
+ This is rtant for language with complex script (E.g. Thai).''')
1037
+ t0_optimal_num_chars = st.slider("optimal_num_chars", None, 100, None, step=10, \
1038
+ help="optimal_num_chars (int, default = None) - If specified, bounding boxes \
1039
+ with estimated number of characters near this value are returned first.")
1040
+
1041
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[1], \
1042
+ expanded=False):
1043
+ t1_det_algorithm = st.selectbox('det_algorithm', ['DB'], \
1044
+ help='Type of detection algorithm selected. (default = DB)')
1045
+ t1_det_max_side_len = st.slider('det_max_side_len', 500, 2000, 960, step=10, \
1046
+ help='''The maximum size of the long side of the image. (default = 960)\n
1047
+ Limit thximum image height and width.\n
1048
+ When theg side exceeds this value, the long side will be resized to this size, and the short side \
1049
+ will be ed proportionally.''')
1050
+ t1_det_db_thresh = st.slider('det_db_thresh', 0.1, 1., 0.3, step=0.1, \
1051
+ help='''Binarization threshold value of DB output map. (default = 0.3) \n
1052
+ Used to er the binarized image of DB prediction, setting 0.-0.3 has no obvious effect on the result.''')
1053
+ t1_det_db_box_thresh = st.slider('det_db_box_thresh', 0.1, 1., 0.6, step=0.1, \
1054
+ help='''The threshold value of the DB output box. (default = 0.6) \n
1055
+ DB post-essing filter box threshold, if there is a missing box detected, it can be reduced as appropriate. \n
1056
+ Boxes sclower than this value will be discard.''')
1057
+ t1_det_db_unclip_ratio = st.slider('det_db_unclip_ratio', 1., 3.0, 1.6, step=0.1, \
1058
+ help='''The expanded ratio of DB output box. (default = 1.6) \n
1059
+ Indicatee compactness of the text box, the smaller the value, the closer the text box to the text.''')
1060
+ t1_det_east_score_thresh = st.slider('det_east_cover_thresh', 0.1, 1., 0.8, step=0.1, \
1061
+ help="Binarization threshold value of EAST output map. (default = 0.8)")
1062
+ t1_det_east_cover_thresh = st.slider('det_east_cover_thresh', 0.1, 1., 0.1, step=0.1, \
1063
+ help='''The threshold value of the EAST output box. (default = 0.1) \n
1064
+ Boxes sclower than this value will be discarded.''')
1065
+ t1_det_east_nms_thresh = st.slider('det_east_nms_thresh', 0.1, 1., 0.2, step=0.1, \
1066
+ help="The NMS threshold value of EAST model output box. (default = 0.2)")
1067
+ t1_det_db_score_mode = st.selectbox('det_db_score_mode', ['fast', 'slow'], \
1068
+ help='''slow: use polygon box to calculate bbox score, fast: use rectangle box \
1069
+ to calculate. (default = fast) \n
1070
+ Use rectlar box to calculate faster, and polygonal box more accurate for curved text area.''')
1071
+
1072
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[2], \
1073
+ expanded=False):
1074
+ t2_det = st.selectbox('det', ['DB_r18','DB_r50','DBPP_r50','DRRG','FCE_IC15', \
1075
+ 'FCE_CTW_DCNv2','MaskRCNN_CTW','MaskRCNN_IC15', \
1076
+ 'MaskRCNN_IC17', 'PANet_CTW','PANet_IC15','PS_CTW',\
1077
+ 'PS_IC15','Tesseract','TextSnake'], 10, \
1078
+ help='Text detection algorithm. (default = PANet_IC15)')
1079
+ st.write("###### *More about text detection models* 👉 \
1080
+ [here](https://mmocr.readthedocs.io/en/latest/textdet_models.html)")
1081
+ t2_merge_xdist = st.slider('merge_xdist', 1, 50, 20, step=1, \
1082
+ help='The maximum x-axis distance to merge boxes. (defaut=20)')
1083
+
1084
+ with col2.expander("Choose detection hyperparameters for " + reader_type_list[3], \
1085
+ expanded=False):
1086
+ t3_psm = st.selectbox('Page segmentation mode (psm)', \
1087
+ [' - Default', \
1088
+ ' 4 Assume a single column of text of variable sizes', \
1089
+ ' 5 Assume a single uniform block of vertically aligned text', \
1090
+ ' 6 Assume a single uniform block of text', \
1091
+ ' 7 Treat the image as a single text line', \
1092
+ ' 8 Treat the image as a single word', \
1093
+ ' 9 Treat the image as a single word in a circle', \
1094
+ '10 Treat the image as a single character', \
1095
+ '11 Sparse text. Find as much text as possible in no ' \
1096
+ 'particular order', \
1097
+ '13 Raw line. Treat the image as a single text line, ' \
1098
+ 'bypassing hacks that are Tesseract-specific'])
1099
+ t3_oem = st.selectbox('OCR engine mode', ['0 Legacy engine only', \
1100
+ '1 Neural nets LSTM engine only', \
1101
+ '2 Legacy + LSTM engines', \
1102
+ '3 Default, based on what is available'], 3)
1103
+ t3_whitelist = st.text_input('Limit Tesseract to recognize only these characters:', \
1104
+ placeholder='Limit Tesseract to recognize only these characters', \
1105
+ help='Example for numbers only: 0123456789')
1106
+
1107
+ color_hex = col2.color_picker('Set a color for box outlines:', '#004C99')
1108
+ color_part = color_hex.lstrip('#')
1109
+ color = tuple(int(color_part[i:i+2], 16) for i in (0, 2, 4))
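+ # e.g. '#004C99' -> '004C99' -> (0x00, 0x4C, 0x99) == (0, 76, 153), an (R, G, B) tuple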
1110
+
1111
+ submit_detect = st.form_submit_button("Launch detection")
1112
+
1113
+ ##----------- Process text detection --------------------------------------------------------------
1114
+ if submit_detect:
1115
+ # Process text detection
1116
+
1117
+ if t0_optimal_num_chars == 0:
1118
+ t0_optimal_num_chars = None
1119
+
1120
+ # Construct the config Tesseract parameter
1121
+ t3_config = ''
1122
+ psm = t3_psm[:2]
1123
+ if psm != ' -':
1124
+ t3_config += '--psm ' + psm.strip()
1125
+ oem = t3_oem[:1]
1126
+ if oem != '3':
1127
+ t3_config += ' --oem ' + oem
1128
+ if t3_whitelist != '':
1129
+ t3_config += ' -c tessedit_char_whitelist=' + t3_whitelist
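+ # e.g. psm=' 6 ...', oem='1', whitelist='0123456789' -> '--psm 6 --oem 1 -c tessedit_char_whitelist=0123456789'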
1130
+
1131
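+ # One parameter set per engine, in reader_type_list order: EasyOCR, PPOCR, MMOCR, Tesseract (assumed from the *_lang names)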
+ list_params_det = \
1132
+ [[easyocr_lang, \
1133
+ {'min_size': t0_min_size, 'text_threshold': t0_text_threshold, \
1134
+ 'low_text': t0_low_text, 'link_threshold': t0_link_threshold, \
1135
+ 'canvas_size': t0_canvas_size, 'mag_ratio': t0_mag_ratio, \
1136
+ 'slope_ths': t0_slope_ths, 'ycenter_ths': t0_ycenter_ths, \
1137
+ 'height_ths': t0_height_ths, 'width_ths': t0_width_ths, \
1138
+ 'add_margin': t0_add_margin, 'optimal_num_chars': t0_optimal_num_chars \
1139
+ }], \
1140
+ [ppocr_lang, \
1141
+ {'det_algorithm': t1_det_algorithm, 'det_max_side_len': t1_det_max_side_len, \
1142
+ 'det_db_thresh': t1_det_db_thresh, 'det_db_box_thresh': t1_det_db_box_thresh, \
1143
+ 'det_db_unclip_ratio': t1_det_db_unclip_ratio, \
1144
+ 'det_east_score_thresh': t1_det_east_score_thresh, \
1145
+ 'det_east_cover_thresh': t1_det_east_cover_thresh, \
1146
+ 'det_east_nms_thresh': t1_det_east_nms_thresh, \
1147
+ 'det_db_score_mode': t1_det_db_score_mode}],
1148
+ [mmocr_lang, {'det': t2_det, 'merge_xdist': t2_merge_xdist}],
1149
+ [tesserocr_lang, {'lang': tesserocr_lang, 'config': t3_config}]
1150
+ ]
1151
+
1152
+ show_info1 = st.empty()
1153
+ show_info1.info("Readers initializations in progress (it may take a while) ...")
1154
+ list_readers = init_readers(list_params_det)
1155
+
1156
+ show_info1.info("Text detection in progress ...")
1157
+ list_images, list_coordinates = process_detect(image_path, list_images, list_readers, \
1158
+ list_params_det, color)
1159
+ show_info1.empty()
1160
+
1161
+ # Clear previous recognition results
1162
+ st.session_state.df_results = pd.DataFrame([])
1163
+
1164
+ st.session_state.list_readers = list_readers
1165
+ st.session_state.list_coordinates = list_coordinates
1166
+ st.session_state.list_images = list_images
1167
+ st.session_state.list_params_det = list_params_det
1168
+
1169
+ if 'columns_size' not in st.session_state:
1170
+ st.session_state.columns_size = [2] + [1 for x in reader_type_list[1:]]
1171
+ if 'column_width' not in st.session_state:
1172
+ st.session_state.column_width = [400] + [300 for x in reader_type_list[1:]]
1173
+ if 'columns_color' not in st.session_state:
1174
+ st.session_state.columns_color = ["rgb(228,26,28)"] + \
1175
+ ["rgb(0,0,0)" for x in reader_type_list[1:]]
1176
+
1177
+ if st.session_state.list_coordinates:
1178
+ list_coordinates = st.session_state.list_coordinates
1179
+ list_images = st.session_state.list_images
1180
+ list_readers = st.session_state.list_readers
1181
+ list_params_det = st.session_state.list_params_det
1182
+
1183
+ ##----------- Text detection results --------------------------------------------------------------
1184
+ st.subheader("Text detection")
1185
+ show_detect = st.empty()
1186
+ list_ok_detect = []
1187
+ with show_detect.container():
1188
+ columns = st.columns(st.session_state.columns_size)  # gap='medium'
1189
+ for no_col, col in enumerate(columns):
1190
+ column_title = '<p style="font-size: 20px;color:' + \
1191
+ st.session_state.columns_color[no_col] + \
1192
+ ';">Detection with ' + reader_type_list[no_col]+ '</p>'
1193
+ col.markdown(column_title, unsafe_allow_html=True)
1194
+ if isinstance(list_images[no_col+2], PIL.Image.Image):
1195
+ col.image(list_images[no_col+2], width=st.session_state.column_width[no_col], \
1196
+ use_column_width=True)
1197
+ list_ok_detect.append(reader_type_list[no_col])
1198
+ else:
1199
+ col.write(list_images[no_col+2], use_column_width=True)
1200
+
1201
+ st.subheader("Text recognition")
1202
+
1203
+ st.markdown("##### Using detection performed above by:")
1204
+ st.radio('Choose the detector:', list_ok_detect, key='detect_reader', \
1205
+ horizontal=True, on_change=highlight)
1206
+
1207
+ ##----------- Form with hyperparameters for recognition -----------------------
1208
+ st.markdown("##### Hyperparameters values for recognition:")
1209
+ with st.form("form2"):
1210
+ with st.expander("Choose recognition hyperparameters for " + reader_type_list[0], \
1211
+ expanded=False):
1212
+ t0_decoder = st.selectbox('decoder', ['greedy', 'beamsearch', 'wordbeamsearch'], \
1213
+ help="decoder (string, default = 'greedy') - options are 'greedy', \
1214
+ 'beamsearch' and 'wordbeamsearch'.")
1215
+ t0_beamWidth = st.slider('beamWidth', 2, 20, 5, step=1, \
1216
+ help="beamWidth (int, default = 5) - How many beam to keep when decoder = \
1217
+ 'beamsearch' or 'wordbeamsearch'.")
1218
+ t0_batch_size = st.slider('batch_size', 1, 10, 1, step=1, \
1219
+ help="batch_size (int, default = 1) - batch_size>1 will make EasyOCR faster \
1220
+ but use more memory.")
1221
+ t0_workers = st.slider('workers', 0, 10, 0, step=1, \
1222
+ help="workers (int, default = 0) - Number thread used in of dataloader.")
1223
+ t0_allowlist = st.text_input('allowlist', value="", max_chars=None, \
1224
+ placeholder='Force EasyOCR to recognize only this subset of characters', \
1225
+ help='''allowlist (string) - Force EasyOCR to recognize only a subset of characters.\n
1226
+ Useful for specific problems (e.g. license plates, etc.)''')
1227
+ t0_blocklist = st.text_input('blocklist', value="", max_chars=None, \
1228
+ placeholder='Block a subset of characters (will be ignored if allowlist is given)', \
1229
+ help='''blocklist (string) - Block a subset of characters. This argument will be \
1230
+ ignored if allowlist is given.''')
1231
+ t0_detail = st.radio('detail', [0, 1], 1, horizontal=True, \
1232
+ help="detail (int, default = 1) - Set this to 0 for simple output")
1233
+ t0_paragraph = st.radio('paragraph', [True, False], 1, horizontal=True, \
1234
+ help='paragraph (bool, default = False) - Combine result into paragraph')
1235
+ t0_contrast_ths = st.slider('contrast_ths', 0.05, 1., 0.1, step=0.01, \
1236
+ help='''contrast_ths (float, default = 0.1) - Text box with contrast lower than \
1237
+ this value will be passed into model 2 times.\n
1238
+ First is with the original image and second with the contrast adjusted to the 'adjust_contrast' value.\n
1239
+ The one with the higher confidence level will be returned as the result.''')
1240
+ t0_adjust_contrast = st.slider('adjust_contrast', 0.1, 1., 0.5, step=0.1, \
1241
+ help = 'adjust_contrast (float, default = 0.5) - target contrast level for low \
1242
+ contrast text box')
1243
+
1244
+ with st.expander("Choose recognition hyperparameters for " + reader_type_list[1], \
1245
+ expanded=False):
1246
+ t1_rec_algorithm = st.selectbox('rec_algorithm', ['CRNN', 'SVTR_LCNet'], 0, \
1247
+ help="Type of recognition algorithm selected. (default=CRNN)")
1248
+ t1_rec_batch_num = st.slider('rec_batch_num', 1, 50, 30, step=1, \
1249
+ help="When performing recognition, the batchsize of forward images. \
1250
+ (default=30)")
1251
+ t1_max_text_length = st.slider('max_text_length', 3, 250, 25, step=1, \
1252
+ help="The maximum text length that the recognition algorithm can recognize. \
1253
+ (default=25)")
1254
+ t1_use_space_char = st.radio('use_space_char', [True, False], 0, horizontal=True, \
1255
+ help="Whether to recognize spaces. (default=TRUE)")
1256
+ t1_drop_score = st.slider('drop_score', 0., 1., 0.25, step=.05, \
1257
+ help="Filter the output by score (from the recognition model), and those \
1258
+ below this score will not be returned. (default = 0.25 here; PaddleOCR's own default is 0.5)")
1259
+
1260
+ with st.expander("Choose recognition hyperparameters for " + reader_type_list[2], \
1261
+ expanded=False):
1262
+ t2_recog = st.selectbox('recog', ['ABINet','CRNN','CRNN_TPS','MASTER', \
1263
+ 'NRTR_1/16-1/8','NRTR_1/8-1/4','RobustScanner','SAR','SAR_CN', \
1264
+ 'SATRN','SATRN_sm','SEG','Tesseract'], 7, \
1265
+ help='Text recognition algorithm. (default = SAR)')
1266
+ st.write("###### *More about text recognition models* 👉 \
1267
+ [here](https://mmocr.readthedocs.io/en/latest/textrecog_models.html)")
1268
+
1269
+ with st.expander("Choose recognition hyperparameters for " + reader_type_list[3], \
1270
+ expanded=False):
1271
+ t3r_psm = st.selectbox('Page segmentation mode (psm)', \
1272
+ [' - Default', \
1273
+ ' 4 Assume a single column of text of variable sizes', \
1274
+ ' 5 Assume a single uniform block of vertically aligned ' \
1275
+ 'text', \
1276
+ ' 6 Assume a single uniform block of text', \
1277
+ ' 7 Treat the image as a single text line', \
1278
+ ' 8 Treat the image as a single word', \
1279
+ ' 9 Treat the image as a single word in a circle', \
1280
+ '10 Treat the image as a single character', \
1281
+ '11 Sparse text. Find as much text as possible in no ' \
1282
+ 'particular order', \
1283
+ '13 Raw line. Treat the image as a single text line, ' \
1284
+ 'bypassing hacks that are Tesseract-specific'])
1285
+ t3r_oem = st.selectbox('OCR engine mode', ['0 Legacy engine only', \
1286
+ '1 Neural nets LSTM engine only', \
1287
+ '2 Legacy + LSTM engines', \
1288
+ '3 Default, based on what is available'], 3)
1289
+ t3r_whitelist = st.text_input('Limit Tesseract to recognize only these ' \
1290
+ 'characters:', \
1291
+ placeholder='Limit Tesseract to recognize only these characters', \
1292
+ help='Example for numbers only: 0123456789')
1293
+
1294
+ submit_reco = st.form_submit_button("Launch recognition")
1295
+
1296
+ if submit_reco:
1297
+ process_detect.clear()
1298
+ ##----------- Process recognition ------------------------------------------
1299
+ reader_ind = reader_type_dict[st.session_state.detect_reader]
1300
+ list_boxes = list_coordinates[reader_ind]
1301
+
1302
+ # Construct the config Tesseract parameter
1303
+ t3r_config = ''
1304
+ psm = t3r_psm[:2]
1305
+ if psm != ' -':
1306
+ t3r_config += '--psm ' + psm.strip()
1307
+ oem = t3r_oem[:1]
1308
+ if oem != '3':
1309
+ t3r_config += ' --oem ' + oem
1310
+ if t3r_whitelist != '':
1311
+ t3r_config += ' -c tessedit_char_whitelist=' + t3r_whitelist
1312
+
1313
+ list_params_rec = \
1314
+ [{'decoder': t0_decoder, 'beamWidth': t0_beamWidth, \
1315
+ 'batch_size': t0_batch_size, 'workers': t0_workers, \
1316
+ 'allowlist': t0_allowlist, 'blocklist': t0_blocklist, \
1317
+ 'detail': t0_detail, 'paragraph': t0_paragraph, \
1318
+ 'contrast_ths': t0_contrast_ths, 'adjust_contrast': t0_adjust_contrast
1319
+ },
1320
+ { **list_params_det[1][1], **{'rec_algorithm': t1_rec_algorithm, \
1321
+ 'rec_batch_num': t1_rec_batch_num, 'max_text_length': t1_max_text_length, \
1322
+ 'use_space_char': t1_use_space_char, 'drop_score': t1_drop_score}, \
1323
+ **{'lang': list_params_det[1][0]}
1324
+ },
1325
+ {'recog': t2_recog},
1326
+ {'lang': tesserocr_lang, 'config': t3r_config}
1327
+ ]
1328
+
1329
+ show_info2 = st.empty()
1330
+
1331
+ with show_info2.container():
1332
+ st.info("Text recognition in progress ...")
1333
+ df_results, df_results_tesseract, list_reco_status = \
1334
+ process_recog(list_readers, list_images[1], list_boxes, list_params_rec)
1335
+ show_info2.empty()
1336
+
1337
+ st.session_state.df_results = df_results
1338
+ st.session_state.list_boxes = list_boxes
1339
+ st.session_state.df_results_tesseract = df_results_tesseract
1340
+ st.session_state.list_reco_status = list_reco_status
1341
+
1342
+ if 'df_results' in st.session_state:
1343
+ if not st.session_state.df_results.empty:
1344
+ ##----------- Show recognition results ------------------------------------------------------------
1345
+ results_cols = st.session_state.df_results.columns
1346
+ list_col_text = np.arange(1, len(cols_size), 2)
1347
+ list_col_confid = np.arange(2, len(cols_size), 2)
1348
+
1349
+ dict_draw_reco = {'in_image': st.session_state.list_images[1], \
1350
+ 'in_boxes_coordinates': st.session_state.list_boxes, \
1351
+ 'in_list_texts': [st.session_state.df_results[x].to_list() \
1352
+ for x in results_cols[list_col_text]], \
1353
+ 'in_list_confid': [st.session_state.df_results[x].to_list() \
1354
+ for x in results_cols[list_col_confid]], \
1355
+ 'in_dict_back_colors': dict_back_colors, \
1356
+ 'in_df_results_tesseract' : st.session_state.df_results_tesseract, \
1357
+ 'in_reader_type_list': reader_type_list
1358
+ }
1359
+ show_reco = st.empty()
1360
+
1361
+ with st.form("form3"):
1362
+ st.plotly_chart(fig_colorscale, use_container_width=True)
1363
+
1364
+ col_font, col_threshold = st.columns(2)
1365
+
1366
+ col_font.slider('Font scale', 1, 7, 1, step=1, key="font_scale_sld")
1367
+ col_threshold.slider('% confidence threshold for text color change', 40, 100, 64, \
1368
+ step=1, key="conf_threshold_sld")
1369
+ col_threshold.write("(text color is black below this % confidence threshold, \
1370
+ and white above)")
1371
+
1372
+ draw_reco_images(**dict_draw_reco)
1373
+
1374
+ submit_resize = st.form_submit_button("Refresh")
1375
+
1376
+ if submit_resize:
1377
+ draw_reco_images(**dict_draw_reco, \
1378
+ in_font_scale=st.session_state.font_scale_sld, \
1379
+ in_conf_threshold=st.session_state.conf_threshold_sld)
1380
+
1381
+ st.subheader("Recognition details")
1382
+ with st.expander("Detailed areas for EasyOCR, PPOCR, MMOCR", expanded=True):
1383
+ cols = st.columns(cols_size)
1384
+ cols[0].markdown('#### Detected area')
1385
+ for i in range(1, (len(reader_type_list)-1)*2, 2):
1386
+ cols[i].markdown('#### with ' + reader_type_list[i//2])
1387
+
1388
+ for row in st.session_state.df_results.itertuples():
1390
+ cols = st.columns(cols_size)
1391
+ cols[0].image(row.cropped_image, width=150)
1392
+ for ind_col in range(1, len(cols), 2):
1393
+ cols[ind_col].write(getattr(row, results_cols[ind_col]))
1394
+ cols[ind_col+1].write("("+str( \
1395
+ getattr(row, results_cols[ind_col+1]))+"%)")
1396
+
1397
+ st.download_button(
1398
+ label="Download results as CSV file",
1399
+ data=convert_df(st.session_state.df_results),
1400
+ file_name='OCR_comparator_results.csv',
1401
+ mime='text/csv',
1402
+ )
1403
+
1404
+ if not st.session_state.df_results_tesseract.empty:
1405
+ with st.expander("Detailed areas for Tesseract", expanded=False):
1406
+ cols = st.columns([2,2,1])
1407
+ cols[0].markdown('#### Detected area')
1408
+ cols[1].markdown('#### with Tesseract')
1409
+
1410
+ for row in st.session_state.df_results_tesseract.itertuples():
1411
+ cols = st.columns([2,2,1])
1412
+ cols[0].image(row.cropped, width=150)
1413
+ cols[1].write(getattr(row, 'text'))
1414
+ cols[2].write("("+str(getattr(row, 'conf'))+"%)")
1415
+
1416
+ st.download_button(
1417
+ label="Download Tesseract results as CSV file",
1418
+ data=convert_df(st.session_state.df_results_tesseract),
1419
+ file_name='OCR_comparator_Tesseract_results.csv',
1420
+ mime='text/csv',
1421
+ )
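
For reference, the config strings assembled in the two forms above follow pytesseract's CLI-style format (the app's own Tesseract call is made inside process_recog, defined earlier in this file). A minimal standalone sketch, assuming pytesseract is installed; the image path and parameter values are placeholders:

    import pytesseract
    from PIL import Image

    img = Image.open('sample.png')  # placeholder input image
    data = pytesseract.image_to_data(
        img,
        lang='eng',
        config='--psm 6 --oem 1 -c tessedit_char_whitelist=0123456789',
        output_type=pytesseract.Output.DATAFRAME,  # pandas DataFrame with 'text' and 'conf' columns
    )
    print(data[['text', 'conf']].head())
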
configs/_base_/default_runtime.py ADDED
@@ -0,0 +1,17 @@
1
+ # yapf:disable
2
+ log_config = dict(
3
+ interval=5,
4
+ hooks=[
5
+ dict(type='TextLoggerHook')
6
+ ])
7
+ # yapf:enable
8
+ dist_params = dict(backend='nccl')
9
+ log_level = 'INFO'
10
+ load_from = None
11
+ resume_from = None
12
+ workflow = [('train', 1)]
13
+
14
+ # disable opencv multithreading to avoid system being overloaded
15
+ opencv_num_threads = 0
16
+ # set multi-process start method as `fork` to speed up the training
17
+ mp_start_method = 'fork'
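
These _base_ fragments are not meant to be run on their own; MMOCR merges them into full training configs. A minimal sketch of loading one directly, assuming mmcv 1.x (whose Config.fromfile handles these Python-style configs); the path is relative to the repo root:

    from mmcv import Config

    cfg = Config.fromfile('configs/_base_/default_runtime.py')
    print(cfg.log_config.interval)  # -> 5
    print(cfg.mp_start_method)      # -> 'fork'
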
configs/_base_/det_datasets/ctw1500.py ADDED
@@ -0,0 +1,18 @@
1
+ dataset_type = 'IcdarDataset'
2
+ data_root = 'data/ctw1500'
3
+
4
+ train = dict(
5
+ type=dataset_type,
6
+ ann_file=f'{data_root}/instances_training.json',
7
+ img_prefix=f'{data_root}/imgs',
8
+ pipeline=None)
9
+
10
+ test = dict(
11
+ type=dataset_type,
12
+ ann_file=f'{data_root}/instances_test.json',
13
+ img_prefix=f'{data_root}/imgs',
14
+ pipeline=None)
15
+
16
+ train_list = [train]
17
+
18
+ test_list = [test]
configs/_base_/det_datasets/icdar2015.py ADDED
@@ -0,0 +1,18 @@
1
+ dataset_type = 'IcdarDataset'
2
+ data_root = 'data/icdar2015'
3
+
4
+ train = dict(
5
+ type=dataset_type,
6
+ ann_file=f'{data_root}/instances_training.json',
7
+ img_prefix=f'{data_root}/imgs',
8
+ pipeline=None)
9
+
10
+ test = dict(
11
+ type=dataset_type,
12
+ ann_file=f'{data_root}/instances_test.json',
13
+ img_prefix=f'{data_root}/imgs',
14
+ pipeline=None)
15
+
16
+ train_list = [train]
17
+
18
+ test_list = [test]
configs/_base_/det_datasets/icdar2017.py ADDED
@@ -0,0 +1,18 @@
1
+ dataset_type = 'IcdarDataset'
2
+ data_root = 'data/icdar2017'
3
+
4
+ train = dict(
5
+ type=dataset_type,
6
+ ann_file=f'{data_root}/instances_training.json',
7
+ img_prefix=f'{data_root}/imgs',
8
+ pipeline=None)
9
+
10
+ test = dict(
11
+ type=dataset_type,
12
+ ann_file=f'{data_root}/instances_val.json',
13
+ img_prefix=f'{data_root}/imgs',
14
+ pipeline=None)
15
+
16
+ train_list = [train]
17
+
18
+ test_list = [test]
configs/_base_/det_datasets/synthtext.py ADDED
@@ -0,0 +1,18 @@
1
+ dataset_type = 'TextDetDataset'
2
+ data_root = 'data/synthtext'
3
+
4
+ train = dict(
5
+ type=dataset_type,
6
+ ann_file=f'{data_root}/instances_training.lmdb',
7
+ loader=dict(
8
+ type='AnnFileLoader',
9
+ repeat=1,
10
+ file_format='lmdb',
11
+ parser=dict(
12
+ type='LineJsonParser',
13
+ keys=['file_name', 'height', 'width', 'annotations'])),
14
+ img_prefix=f'{data_root}/imgs',
15
+ pipeline=None)
16
+
17
+ train_list = [train]
18
+ test_list = [train]
configs/_base_/det_datasets/toy_data.py ADDED
@@ -0,0 +1,41 @@
1
+ root = 'tests/data/toy_dataset'
2
+
3
+ # dataset with type='TextDetDataset'
4
+ train1 = dict(
5
+ type='TextDetDataset',
6
+ img_prefix=f'{root}/imgs',
7
+ ann_file=f'{root}/instances_test.txt',
8
+ loader=dict(
9
+ type='AnnFileLoader',
10
+ repeat=4,
11
+ file_format='txt',
12
+ parser=dict(
13
+ type='LineJsonParser',
14
+ keys=['file_name', 'height', 'width', 'annotations'])),
15
+ pipeline=None,
16
+ test_mode=False)
17
+
18
+ # dataset with type='IcdarDataset'
19
+ train2 = dict(
20
+ type='IcdarDataset',
21
+ ann_file=f'{root}/instances_test.json',
22
+ img_prefix=f'{root}/imgs',
23
+ pipeline=None)
24
+
25
+ test = dict(
26
+ type='TextDetDataset',
27
+ img_prefix=f'{root}/imgs',
28
+ ann_file=f'{root}/instances_test.txt',
29
+ loader=dict(
30
+ type='AnnFileLoader',
31
+ repeat=1,
32
+ file_format='txt',
33
+ parser=dict(
34
+ type='LineJsonParser',
35
+ keys=['file_name', 'height', 'width', 'annotations'])),
36
+ pipeline=None,
37
+ test_mode=True)
38
+
39
+ train_list = [train1, train2]
40
+
41
+ test_list = [test]
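
Each dataset fragment above exports train_list/test_list with pipeline=None; a downstream config inherits them via _base_ and injects the pipelines. A sketch of the usual MMOCR 0.x composition; names and values are illustrative, and the {{_base_.xxx}} placeholders are mmcv config syntax (resolved textually at load time), not plain Python:

    _base_ = [
        '../../_base_/det_datasets/toy_data.py',
        '../../_base_/det_pipelines/dbnet_pipeline.py',
    ]

    train_list = {{_base_.train_list}}  # resolved by mmcv when the config is loaded
    test_list = {{_base_.test_list}}

    data = dict(
        samples_per_gpu=4,  # illustrative batch size
        train=dict(type='UniformConcatDataset', datasets=train_list,
                   pipeline={{_base_.train_pipeline_r18}}),
        test=dict(type='UniformConcatDataset', datasets=test_list,
                  pipeline={{_base_.test_pipeline_1333_736}}))
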
configs/_base_/det_models/dbnet_r18_fpnc.py ADDED
@@ -0,0 +1,21 @@
1
+ model = dict(
2
+ type='DBNet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=18,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
11
+ norm_eval=False,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPNC', in_channels=[64, 128, 256, 512], lateral_channels=256),
15
+ bbox_head=dict(
16
+ type='DBHead',
17
+ in_channels=256,
18
+ loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
19
+ postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
20
+ train_cfg=None,
21
+ test_cfg=None)
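
A model fragment like this only declares the model dict; building it is a separate step. A minimal sketch, assuming MMOCR 0.x, where build_detector is re-exported from mmocr.models:

    from mmcv import Config
    from mmocr.models import build_detector

    cfg = Config.fromfile('configs/_base_/det_models/dbnet_r18_fpnc.py')
    model = build_detector(cfg.model)
    model.init_weights()  # fetches the torchvision://resnet18 backbone checkpoint
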
configs/_base_/det_models/dbnet_r50dcnv2_fpnc.py ADDED
@@ -0,0 +1,23 @@
1
+ model = dict(
2
+ type='DBNet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ norm_eval=False,
11
+ style='pytorch',
12
+ dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
13
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
14
+ stage_with_dcn=(False, True, True, True)),
15
+ neck=dict(
16
+ type='FPNC', in_channels=[256, 512, 1024, 2048], lateral_channels=256),
17
+ bbox_head=dict(
18
+ type='DBHead',
19
+ in_channels=256,
20
+ loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
21
+ postprocessor=dict(type='DBPostprocessor', text_repr_type='quad')),
22
+ train_cfg=None,
23
+ test_cfg=None)
configs/_base_/det_models/dbnetpp_r50dcnv2_fpnc.py ADDED
@@ -0,0 +1,28 @@
1
+ model = dict(
2
+ type='DBNet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ norm_eval=False,
11
+ style='pytorch',
12
+ dcn=dict(type='DCNv2', deform_groups=1, fallback_on_stride=False),
13
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
14
+ stage_with_dcn=(False, True, True, True)),
15
+ neck=dict(
16
+ type='FPNC',
17
+ in_channels=[256, 512, 1024, 2048],
18
+ lateral_channels=256,
19
+ asf_cfg=dict(attention_type='ScaleChannelSpatial')),
20
+ bbox_head=dict(
21
+ type='DBHead',
22
+ in_channels=256,
23
+ loss=dict(type='DBLoss', alpha=5.0, beta=10.0, bbce_loss=True),
24
+ postprocessor=dict(
25
+ type='DBPostprocessor', text_repr_type='quad',
26
+ epsilon_ratio=0.002)),
27
+ train_cfg=None,
28
+ test_cfg=None)
configs/_base_/det_models/drrg_r50_fpn_unet.py ADDED
@@ -0,0 +1,21 @@
1
+ model = dict(
2
+ type='DRRG',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
15
+ bbox_head=dict(
16
+ type='DRRGHead',
17
+ in_channels=32,
18
+ text_region_thr=0.3,
19
+ center_region_thr=0.4,
20
+ loss=dict(type='DRRGLoss'),
21
+ postprocessor=dict(type='DRRGPostprocessor', link_thr=0.80)))
configs/_base_/det_models/fcenet_r50_fpn.py ADDED
@@ -0,0 +1,33 @@
1
+ model = dict(
2
+ type='FCENet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=False,
12
+ style='pytorch'),
13
+ neck=dict(
14
+ type='mmdet.FPN',
15
+ in_channels=[512, 1024, 2048],
16
+ out_channels=256,
17
+ add_extra_convs='on_output',
18
+ num_outs=3,
19
+ relu_before_extra_convs=True,
20
+ act_cfg=None),
21
+ bbox_head=dict(
22
+ type='FCEHead',
23
+ in_channels=256,
24
+ scales=(8, 16, 32),
25
+ fourier_degree=5,
26
+ loss=dict(type='FCELoss', num_sample=50),
27
+ postprocessor=dict(
28
+ type='FCEPostprocessor',
29
+ text_repr_type='quad',
30
+ num_reconstr_points=50,
31
+ alpha=1.2,
32
+ beta=1.0,
33
+ score_thr=0.3)))
configs/_base_/det_models/fcenet_r50dcnv2_fpn.py ADDED
@@ -0,0 +1,35 @@
1
+ model = dict(
2
+ type='FCENet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ norm_eval=True,
11
+ style='pytorch',
12
+ dcn=dict(type='DCNv2', deform_groups=2, fallback_on_stride=False),
13
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
14
+ stage_with_dcn=(False, True, True, True)),
15
+ neck=dict(
16
+ type='mmdet.FPN',
17
+ in_channels=[512, 1024, 2048],
18
+ out_channels=256,
19
+ add_extra_convs='on_output',
20
+ num_outs=3,
21
+ relu_before_extra_convs=True,
22
+ act_cfg=None),
23
+ bbox_head=dict(
24
+ type='FCEHead',
25
+ in_channels=256,
26
+ scales=(8, 16, 32),
27
+ fourier_degree=5,
28
+ loss=dict(type='FCELoss', num_sample=50),
29
+ postprocessor=dict(
30
+ type='FCEPostprocessor',
31
+ text_repr_type='poly',
32
+ num_reconstr_points=50,
33
+ alpha=1.0,
34
+ beta=2.0,
35
+ score_thr=0.3)))
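
FCENet's fourier_degree=5 means each text contour is encoded as complex Fourier coefficients c_{-5}..c_{5}, and the postprocessor resamples num_reconstr_points points from that series. A numpy sketch of the reconstruction step only; the real FCEPostprocessor also thresholds score maps and applies NMS:

    import numpy as np

    def reconstruct_contour(coeffs: np.ndarray, num_points: int = 50) -> np.ndarray:
        """coeffs: complex c_{-k}..c_{k}; returns a (num_points, 2) x/y contour."""
        k = (len(coeffs) - 1) // 2
        t = np.linspace(0, 1, num_points, endpoint=False)
        # f(t) = sum_n c_n * exp(2j * pi * n * t) for n = -k..k
        z = sum(c * np.exp(2j * np.pi * n * t) for n, c in zip(range(-k, k + 1), coeffs))
        return np.stack([z.real, z.imag], axis=1)
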
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem.py ADDED
@@ -0,0 +1,126 @@
1
+ # model settings
2
+ model = dict(
3
+ type='OCRMaskRCNN',
4
+ backbone=dict(
5
+ type='mmdet.ResNet',
6
+ depth=50,
7
+ num_stages=4,
8
+ out_indices=(0, 1, 2, 3),
9
+ frozen_stages=1,
10
+ norm_cfg=dict(type='BN', requires_grad=True),
11
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
12
+ norm_eval=True,
13
+ style='pytorch'),
14
+ neck=dict(
15
+ type='mmdet.FPN',
16
+ in_channels=[256, 512, 1024, 2048],
17
+ out_channels=256,
18
+ num_outs=5),
19
+ rpn_head=dict(
20
+ type='RPNHead',
21
+ in_channels=256,
22
+ feat_channels=256,
23
+ anchor_generator=dict(
24
+ type='AnchorGenerator',
25
+ scales=[4],
26
+ ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
27
+ strides=[4, 8, 16, 32, 64]),
28
+ bbox_coder=dict(
29
+ type='DeltaXYWHBBoxCoder',
30
+ target_means=[.0, .0, .0, .0],
31
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
32
+ loss_cls=dict(
33
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
34
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
35
+ roi_head=dict(
36
+ type='StandardRoIHead',
37
+ bbox_roi_extractor=dict(
38
+ type='SingleRoIExtractor',
39
+ roi_layer=dict(type='RoIAlign', output_size=7, sampling_ratio=0),
40
+ out_channels=256,
41
+ featmap_strides=[4, 8, 16, 32]),
42
+ bbox_head=dict(
43
+ type='Shared2FCBBoxHead',
44
+ in_channels=256,
45
+ fc_out_channels=1024,
46
+ roi_feat_size=7,
47
+ num_classes=1,
48
+ bbox_coder=dict(
49
+ type='DeltaXYWHBBoxCoder',
50
+ target_means=[0., 0., 0., 0.],
51
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
52
+ reg_class_agnostic=False,
53
+ loss_cls=dict(
54
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
55
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
56
+ mask_roi_extractor=dict(
57
+ type='SingleRoIExtractor',
58
+ roi_layer=dict(type='RoIAlign', output_size=14, sampling_ratio=0),
59
+ out_channels=256,
60
+ featmap_strides=[4, 8, 16, 32]),
61
+ mask_head=dict(
62
+ type='FCNMaskHead',
63
+ num_convs=4,
64
+ in_channels=256,
65
+ conv_out_channels=256,
66
+ num_classes=1,
67
+ loss_mask=dict(
68
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
69
+
70
+ # model training and testing settings
71
+ train_cfg=dict(
72
+ rpn=dict(
73
+ assigner=dict(
74
+ type='MaxIoUAssigner',
75
+ pos_iou_thr=0.7,
76
+ neg_iou_thr=0.3,
77
+ min_pos_iou=0.3,
78
+ match_low_quality=True,
79
+ ignore_iof_thr=-1,
80
+ gpu_assign_thr=50),
81
+ sampler=dict(
82
+ type='RandomSampler',
83
+ num=256,
84
+ pos_fraction=0.5,
85
+ neg_pos_ub=-1,
86
+ add_gt_as_proposals=False),
87
+ allowed_border=-1,
88
+ pos_weight=-1,
89
+ debug=False),
90
+ rpn_proposal=dict(
91
+ nms_across_levels=False,
92
+ nms_pre=2000,
93
+ nms_post=1000,
94
+ max_per_img=1000,
95
+ nms=dict(type='nms', iou_threshold=0.7),
96
+ min_bbox_size=0),
97
+ rcnn=dict(
98
+ assigner=dict(
99
+ type='MaxIoUAssigner',
100
+ pos_iou_thr=0.5,
101
+ neg_iou_thr=0.5,
102
+ min_pos_iou=0.5,
103
+ match_low_quality=True,
104
+ ignore_iof_thr=-1),
105
+ sampler=dict(
106
+ type='OHEMSampler',
107
+ num=512,
108
+ pos_fraction=0.25,
109
+ neg_pos_ub=-1,
110
+ add_gt_as_proposals=True),
111
+ mask_size=28,
112
+ pos_weight=-1,
113
+ debug=False)),
114
+ test_cfg=dict(
115
+ rpn=dict(
116
+ nms_across_levels=False,
117
+ nms_pre=1000,
118
+ nms_post=1000,
119
+ max_per_img=1000,
120
+ nms=dict(type='nms', iou_threshold=0.7),
121
+ min_bbox_size=0),
122
+ rcnn=dict(
123
+ score_thr=0.05,
124
+ nms=dict(type='nms', iou_threshold=0.5),
125
+ max_per_img=100,
126
+ mask_thr_binary=0.5)))
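
The rcnn stage swaps the usual RandomSampler for type='OHEMSampler': online hard example mining keeps the proposals with the largest loss instead of a random subset. A minimal torch sketch of the selection idea; the real sampler also balances positives and negatives via pos_fraction:

    import torch

    def ohem_select(per_roi_loss: torch.Tensor, num_keep: int) -> torch.Tensor:
        # indices of the hardest proposals, i.e. those with the largest loss
        num_keep = min(num_keep, per_roi_loss.numel())
        return torch.topk(per_roi_loss, k=num_keep).indices
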
configs/_base_/det_models/ocr_mask_rcnn_r50_fpn_ohem_poly.py ADDED
@@ -0,0 +1,126 @@
1
+ # model settings
2
+ model = dict(
3
+ type='OCRMaskRCNN',
4
+ text_repr_type='poly',
5
+ backbone=dict(
6
+ type='mmdet.ResNet',
7
+ depth=50,
8
+ num_stages=4,
9
+ out_indices=(0, 1, 2, 3),
10
+ frozen_stages=1,
11
+ norm_cfg=dict(type='BN', requires_grad=True),
12
+ norm_eval=True,
13
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
14
+ style='pytorch'),
15
+ neck=dict(
16
+ type='mmdet.FPN',
17
+ in_channels=[256, 512, 1024, 2048],
18
+ out_channels=256,
19
+ num_outs=5),
20
+ rpn_head=dict(
21
+ type='RPNHead',
22
+ in_channels=256,
23
+ feat_channels=256,
24
+ anchor_generator=dict(
25
+ type='AnchorGenerator',
26
+ scales=[4],
27
+ ratios=[0.17, 0.44, 1.13, 2.90, 7.46],
28
+ strides=[4, 8, 16, 32, 64]),
29
+ bbox_coder=dict(
30
+ type='DeltaXYWHBBoxCoder',
31
+ target_means=[.0, .0, .0, .0],
32
+ target_stds=[1.0, 1.0, 1.0, 1.0]),
33
+ loss_cls=dict(
34
+ type='CrossEntropyLoss', use_sigmoid=True, loss_weight=1.0),
35
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
36
+ roi_head=dict(
37
+ type='StandardRoIHead',
38
+ bbox_roi_extractor=dict(
39
+ type='SingleRoIExtractor',
40
+ roi_layer=dict(type='RoIAlign', output_size=7, sample_num=0),
41
+ out_channels=256,
42
+ featmap_strides=[4, 8, 16, 32]),
43
+ bbox_head=dict(
44
+ type='Shared2FCBBoxHead',
45
+ in_channels=256,
46
+ fc_out_channels=1024,
47
+ roi_feat_size=7,
48
+ num_classes=80,
49
+ bbox_coder=dict(
50
+ type='DeltaXYWHBBoxCoder',
51
+ target_means=[0., 0., 0., 0.],
52
+ target_stds=[0.1, 0.1, 0.2, 0.2]),
53
+ reg_class_agnostic=False,
54
+ loss_cls=dict(
55
+ type='CrossEntropyLoss', use_sigmoid=False, loss_weight=1.0),
56
+ loss_bbox=dict(type='L1Loss', loss_weight=1.0)),
57
+ mask_roi_extractor=dict(
58
+ type='SingleRoIExtractor',
59
+ roi_layer=dict(type='RoIAlign', output_size=14, sample_num=0),
60
+ out_channels=256,
61
+ featmap_strides=[4, 8, 16, 32]),
62
+ mask_head=dict(
63
+ type='FCNMaskHead',
64
+ num_convs=4,
65
+ in_channels=256,
66
+ conv_out_channels=256,
67
+ num_classes=80,
68
+ loss_mask=dict(
69
+ type='CrossEntropyLoss', use_mask=True, loss_weight=1.0))),
70
+ # model training and testing settings
71
+ train_cfg=dict(
72
+ rpn=dict(
73
+ assigner=dict(
74
+ type='MaxIoUAssigner',
75
+ pos_iou_thr=0.7,
76
+ neg_iou_thr=0.3,
77
+ min_pos_iou=0.3,
78
+ match_low_quality=True,
79
+ ignore_iof_thr=-1),
80
+ sampler=dict(
81
+ type='RandomSampler',
82
+ num=256,
83
+ pos_fraction=0.5,
84
+ neg_pos_ub=-1,
85
+ add_gt_as_proposals=False),
86
+ allowed_border=-1,
87
+ pos_weight=-1,
88
+ debug=False),
89
+ rpn_proposal=dict(
90
+ nms_across_levels=False,
91
+ nms_pre=2000,
92
+ nms_post=1000,
93
+ max_per_img=1000,
94
+ nms=dict(type='nms', iou_threshold=0.7),
95
+ min_bbox_size=0),
96
+ rcnn=dict(
97
+ assigner=dict(
98
+ type='MaxIoUAssigner',
99
+ pos_iou_thr=0.5,
100
+ neg_iou_thr=0.5,
101
+ min_pos_iou=0.5,
102
+ match_low_quality=True,
103
+ ignore_iof_thr=-1,
104
+ gpu_assign_thr=50),
105
+ sampler=dict(
106
+ type='OHEMSampler',
107
+ num=512,
108
+ pos_fraction=0.25,
109
+ neg_pos_ub=-1,
110
+ add_gt_as_proposals=True),
111
+ mask_size=28,
112
+ pos_weight=-1,
113
+ debug=False)),
114
+ test_cfg=dict(
115
+ rpn=dict(
116
+ nms_across_levels=False,
117
+ nms_pre=1000,
118
+ nms_post=1000,
119
+ max_per_img=1000,
120
+ nms=dict(type='nms', iou_threshold=0.7),
121
+ min_bbox_size=0),
122
+ rcnn=dict(
123
+ score_thr=0.05,
124
+ nms=dict(type='nms', iou_threshold=0.5),
125
+ max_per_img=100,
126
+ mask_thr_binary=0.5)))
configs/_base_/det_models/panet_r18_fpem_ffm.py ADDED
@@ -0,0 +1,43 @@
1
+ model_poly = dict(
2
+ type='PANet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=18,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
14
+ bbox_head=dict(
15
+ type='PANHead',
16
+ in_channels=[128, 128, 128, 128],
17
+ out_channels=6,
18
+ loss=dict(type='PANLoss'),
19
+ postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
20
+ train_cfg=None,
21
+ test_cfg=None)
22
+
23
+ model_quad = dict(
24
+ type='PANet',
25
+ backbone=dict(
26
+ type='mmdet.ResNet',
27
+ depth=18,
28
+ num_stages=4,
29
+ out_indices=(0, 1, 2, 3),
30
+ frozen_stages=-1,
31
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
32
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet18'),
33
+ norm_eval=True,
34
+ style='caffe'),
35
+ neck=dict(type='FPEM_FFM', in_channels=[64, 128, 256, 512]),
36
+ bbox_head=dict(
37
+ type='PANHead',
38
+ in_channels=[128, 128, 128, 128],
39
+ out_channels=6,
40
+ loss=dict(type='PANLoss'),
41
+ postprocessor=dict(type='PANPostprocessor', text_repr_type='quad')),
42
+ train_cfg=None,
43
+ test_cfg=None)
configs/_base_/det_models/panet_r50_fpem_ffm.py ADDED
@@ -0,0 +1,21 @@
1
+ model = dict(
2
+ type='PANet',
3
+ pretrained='torchvision://resnet50',
4
+ backbone=dict(
5
+ type='mmdet.ResNet',
6
+ depth=50,
7
+ num_stages=4,
8
+ out_indices=(0, 1, 2, 3),
9
+ frozen_stages=1,
10
+ norm_cfg=dict(type='BN', requires_grad=True),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(type='FPEM_FFM', in_channels=[256, 512, 1024, 2048]),
14
+ bbox_head=dict(
15
+ type='PANHead',
16
+ in_channels=[128, 128, 128, 128],
17
+ out_channels=6,
18
+ loss=dict(type='PANLoss', speedup_bbox_thr=32),
19
+ postprocessor=dict(type='PANPostprocessor', text_repr_type='poly')),
20
+ train_cfg=None,
21
+ test_cfg=None)
configs/_base_/det_models/psenet_r50_fpnf.py ADDED
@@ -0,0 +1,51 @@
1
+ model_poly = dict(
2
+ type='PSENet',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPNF',
15
+ in_channels=[256, 512, 1024, 2048],
16
+ out_channels=256,
17
+ fusion_type='concat'),
18
+ bbox_head=dict(
19
+ type='PSEHead',
20
+ in_channels=[256],
21
+ out_channels=7,
22
+ loss=dict(type='PSELoss'),
23
+ postprocessor=dict(type='PSEPostprocessor', text_repr_type='poly')),
24
+ train_cfg=None,
25
+ test_cfg=None)
26
+
27
+ model_quad = dict(
28
+ type='PSENet',
29
+ backbone=dict(
30
+ type='mmdet.ResNet',
31
+ depth=50,
32
+ num_stages=4,
33
+ out_indices=(0, 1, 2, 3),
34
+ frozen_stages=-1,
35
+ norm_cfg=dict(type='SyncBN', requires_grad=True),
36
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
37
+ norm_eval=True,
38
+ style='caffe'),
39
+ neck=dict(
40
+ type='FPNF',
41
+ in_channels=[256, 512, 1024, 2048],
42
+ out_channels=256,
43
+ fusion_type='concat'),
44
+ bbox_head=dict(
45
+ type='PSEHead',
46
+ in_channels=[256],
47
+ out_channels=7,
48
+ loss=dict(type='PSELoss'),
49
+ postprocessor=dict(type='PSEPostprocessor', text_repr_type='quad')),
50
+ train_cfg=None,
51
+ test_cfg=None)
configs/_base_/det_models/textsnake_r50_fpn_unet.py ADDED
@@ -0,0 +1,22 @@
1
+ model = dict(
2
+ type='TextSnake',
3
+ backbone=dict(
4
+ type='mmdet.ResNet',
5
+ depth=50,
6
+ num_stages=4,
7
+ out_indices=(0, 1, 2, 3),
8
+ frozen_stages=-1,
9
+ norm_cfg=dict(type='BN', requires_grad=True),
10
+ init_cfg=dict(type='Pretrained', checkpoint='torchvision://resnet50'),
11
+ norm_eval=True,
12
+ style='caffe'),
13
+ neck=dict(
14
+ type='FPN_UNet', in_channels=[256, 512, 1024, 2048], out_channels=32),
15
+ bbox_head=dict(
16
+ type='TextSnakeHead',
17
+ in_channels=32,
18
+ loss=dict(type='TextSnakeLoss'),
19
+ postprocessor=dict(
20
+ type='TextSnakePostprocessor', text_repr_type='poly')),
21
+ train_cfg=None,
22
+ test_cfg=None)
configs/_base_/det_pipelines/dbnet_pipeline.py ADDED
@@ -0,0 +1,88 @@
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline_r18 = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(
7
+ type='LoadTextAnnotations',
8
+ with_bbox=True,
9
+ with_mask=True,
10
+ poly2mask=False),
11
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(
14
+ type='ImgAug',
15
+ args=[['Fliplr', 0.5],
16
+ dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
17
+ dict(type='EastRandomCrop', target_size=(640, 640)),
18
+ dict(type='DBNetTargets', shrink_ratio=0.4),
19
+ dict(type='Pad', size_divisor=32),
20
+ dict(
21
+ type='CustomFormatBundle',
22
+ keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
23
+ visualize=dict(flag=False, boundary_key='gt_shrink')),
24
+ dict(
25
+ type='Collect',
26
+ keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
27
+ ]
28
+
29
+ test_pipeline_1333_736 = [
30
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
31
+ dict(
32
+ type='MultiScaleFlipAug',
33
+ img_scale=(1333, 736), # used by Resize
34
+ flip=False,
35
+ transforms=[
36
+ dict(type='Resize', keep_ratio=True),
37
+ dict(type='Normalize', **img_norm_cfg),
38
+ dict(type='Pad', size_divisor=32),
39
+ dict(type='ImageToTensor', keys=['img']),
40
+ dict(type='Collect', keys=['img']),
41
+ ])
42
+ ]
43
+
44
+ # for dbnet_r50dcnv2_fpnc
45
+ img_norm_cfg_r50dcnv2 = dict(
46
+ mean=[122.67891434, 116.66876762, 104.00698793],
47
+ std=[58.395, 57.12, 57.375],
48
+ to_rgb=True)
49
+
50
+ train_pipeline_r50dcnv2 = [
51
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
52
+ dict(
53
+ type='LoadTextAnnotations',
54
+ with_bbox=True,
55
+ with_mask=True,
56
+ poly2mask=False),
57
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
58
+ dict(type='Normalize', **img_norm_cfg_r50dcnv2),
59
+ dict(
60
+ type='ImgAug',
61
+ args=[['Fliplr', 0.5],
62
+ dict(cls='Affine', rotate=[-10, 10]), ['Resize', [0.5, 3.0]]]),
63
+ dict(type='EastRandomCrop', target_size=(640, 640)),
64
+ dict(type='DBNetTargets', shrink_ratio=0.4),
65
+ dict(type='Pad', size_divisor=32),
66
+ dict(
67
+ type='CustomFormatBundle',
68
+ keys=['gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'],
69
+ visualize=dict(flag=False, boundary_key='gt_shrink')),
70
+ dict(
71
+ type='Collect',
72
+ keys=['img', 'gt_shrink', 'gt_shrink_mask', 'gt_thr', 'gt_thr_mask'])
73
+ ]
74
+
75
+ test_pipeline_4068_1024 = [
76
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
77
+ dict(
78
+ type='MultiScaleFlipAug',
79
+ img_scale=(4068, 1024), # used by Resize
80
+ flip=False,
81
+ transforms=[
82
+ dict(type='Resize', keep_ratio=True),
83
+ dict(type='Normalize', **img_norm_cfg_r50dcnv2),
84
+ dict(type='Pad', size_divisor=32),
85
+ dict(type='ImageToTensor', keys=['img']),
86
+ dict(type='Collect', keys=['img']),
87
+ ])
88
+ ]
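
The Normalize step in these pipelines is the standard per-channel (img - mean) / std, applied after the to_rgb channel flip. A numpy sketch with the img_norm_cfg values above:

    import numpy as np

    MEAN = np.array([123.675, 116.28, 103.53])
    STD = np.array([58.395, 57.12, 57.375])

    def normalize(img_bgr: np.ndarray) -> np.ndarray:
        img = img_bgr[..., ::-1].astype(np.float32)  # to_rgb=True: BGR -> RGB
        return (img - MEAN) / STD
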
configs/_base_/det_pipelines/drrg_pipeline.py ADDED
@@ -0,0 +1,60 @@
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(
7
+ type='LoadTextAnnotations',
8
+ with_bbox=True,
9
+ with_mask=True,
10
+ poly2mask=False),
11
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(type='RandomScaling', size=800, scale=(0.75, 2.5)),
14
+ dict(
15
+ type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
16
+ dict(
17
+ type='RandomCropPolyInstances',
18
+ instance_key='gt_masks',
19
+ crop_ratio=0.8,
20
+ min_side_ratio=0.3),
21
+ dict(
22
+ type='RandomRotatePolyInstances',
23
+ rotate_ratio=0.5,
24
+ max_angle=60,
25
+ pad_with_fixed_color=False),
26
+ dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
27
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
28
+ dict(type='DRRGTargets'),
29
+ dict(type='Pad', size_divisor=32),
30
+ dict(
31
+ type='CustomFormatBundle',
32
+ keys=[
33
+ 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
34
+ 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
35
+ 'gt_cos_map', 'gt_comp_attribs'
36
+ ],
37
+ visualize=dict(flag=False, boundary_key='gt_text_mask')),
38
+ dict(
39
+ type='Collect',
40
+ keys=[
41
+ 'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
42
+ 'gt_top_height_map', 'gt_bot_height_map', 'gt_sin_map',
43
+ 'gt_cos_map', 'gt_comp_attribs'
44
+ ])
45
+ ]
46
+
47
+ test_pipeline = [
48
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
49
+ dict(
50
+ type='MultiScaleFlipAug',
51
+ img_scale=(1024, 640), # used by Resize
52
+ flip=False,
53
+ transforms=[
54
+ dict(type='Resize', keep_ratio=True),
55
+ dict(type='Normalize', **img_norm_cfg),
56
+ dict(type='Pad', size_divisor=32),
57
+ dict(type='ImageToTensor', keys=['img']),
58
+ dict(type='Collect', keys=['img']),
59
+ ])
60
+ ]
configs/_base_/det_pipelines/fcenet_pipeline.py ADDED
@@ -0,0 +1,118 @@
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ # for icdar2015
5
+ leval_prop_range_icdar2015 = ((0, 0.4), (0.3, 0.7), (0.6, 1.0))
6
+ train_pipeline_icdar2015 = [
7
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
8
+ dict(
9
+ type='LoadTextAnnotations',
10
+ with_bbox=True,
11
+ with_mask=True,
12
+ poly2mask=False),
13
+ dict(
14
+ type='ColorJitter',
15
+ brightness=32.0 / 255,
16
+ saturation=0.5,
17
+ contrast=0.5),
18
+ dict(type='Normalize', **img_norm_cfg),
19
+ dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
20
+ dict(
21
+ type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
22
+ dict(
23
+ type='RandomCropPolyInstances',
24
+ instance_key='gt_masks',
25
+ crop_ratio=0.8,
26
+ min_side_ratio=0.3),
27
+ dict(
28
+ type='RandomRotatePolyInstances',
29
+ rotate_ratio=0.5,
30
+ max_angle=30,
31
+ pad_with_fixed_color=False),
32
+ dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
33
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
34
+ dict(type='Pad', size_divisor=32),
35
+ dict(
36
+ type='FCENetTargets',
37
+ fourier_degree=5,
38
+ level_proportion_range=leval_prop_range_icdar2015),
39
+ dict(
40
+ type='CustomFormatBundle',
41
+ keys=['p3_maps', 'p4_maps', 'p5_maps'],
42
+ visualize=dict(flag=False, boundary_key=None)),
43
+ dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
44
+ ]
45
+
46
+ img_scale_icdar2015 = (2260, 2260)
47
+ test_pipeline_icdar2015 = [
48
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
49
+ dict(
50
+ type='MultiScaleFlipAug',
51
+ img_scale=img_scale_icdar2015, # used by Resize
52
+ flip=False,
53
+ transforms=[
54
+ dict(type='Resize', keep_ratio=True),
55
+ dict(type='Normalize', **img_norm_cfg),
56
+ dict(type='Pad', size_divisor=32),
57
+ dict(type='ImageToTensor', keys=['img']),
58
+ dict(type='Collect', keys=['img']),
59
+ ])
60
+ ]
61
+
62
+ # for ctw1500
63
+ leval_prop_range_ctw1500 = ((0, 0.25), (0.2, 0.65), (0.55, 1.0))
64
+ train_pipeline_ctw1500 = [
65
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
66
+ dict(
67
+ type='LoadTextAnnotations',
68
+ with_bbox=True,
69
+ with_mask=True,
70
+ poly2mask=False),
71
+ dict(
72
+ type='ColorJitter',
73
+ brightness=32.0 / 255,
74
+ saturation=0.5,
75
+ contrast=0.5),
76
+ dict(type='Normalize', **img_norm_cfg),
77
+ dict(type='RandomScaling', size=800, scale=(3. / 4, 5. / 2)),
78
+ dict(
79
+ type='RandomCropFlip', crop_ratio=0.5, iter_num=1, min_area_ratio=0.2),
80
+ dict(
81
+ type='RandomCropPolyInstances',
82
+ instance_key='gt_masks',
83
+ crop_ratio=0.8,
84
+ min_side_ratio=0.3),
85
+ dict(
86
+ type='RandomRotatePolyInstances',
87
+ rotate_ratio=0.5,
88
+ max_angle=30,
89
+ pad_with_fixed_color=False),
90
+ dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
91
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
92
+ dict(type='Pad', size_divisor=32),
93
+ dict(
94
+ type='FCENetTargets',
95
+ fourier_degree=5,
96
+ level_proportion_range=leval_prop_range_ctw1500),
97
+ dict(
98
+ type='CustomFormatBundle',
99
+ keys=['p3_maps', 'p4_maps', 'p5_maps'],
100
+ visualize=dict(flag=False, boundary_key=None)),
101
+ dict(type='Collect', keys=['img', 'p3_maps', 'p4_maps', 'p5_maps'])
102
+ ]
103
+
104
+ img_scale_ctw1500 = (1080, 736)
105
+ test_pipeline_ctw1500 = [
106
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
107
+ dict(
108
+ type='MultiScaleFlipAug',
109
+ img_scale=img_scale_ctw1500, # used by Resize
110
+ flip=False,
111
+ transforms=[
112
+ dict(type='Resize', keep_ratio=True),
113
+ dict(type='Normalize', **img_norm_cfg),
114
+ dict(type='Pad', size_divisor=32),
115
+ dict(type='ImageToTensor', keys=['img']),
116
+ dict(type='Collect', keys=['img']),
117
+ ])
118
+ ]
configs/_base_/det_pipelines/maskrcnn_pipeline.py ADDED
@@ -0,0 +1,57 @@
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(type='LoadAnnotations', with_bbox=True, with_mask=True),
7
+ dict(
8
+ type='ScaleAspectJitter',
9
+ img_scale=None,
10
+ keep_ratio=False,
11
+ resize_type='indep_sample_in_range',
12
+ scale_range=(640, 2560)),
13
+ dict(type='RandomFlip', flip_ratio=0.5),
14
+ dict(type='Normalize', **img_norm_cfg),
15
+ dict(
16
+ type='RandomCropInstances',
17
+ target_size=(640, 640),
18
+ mask_type='union_all',
19
+ instance_key='gt_masks'),
20
+ dict(type='Pad', size_divisor=32),
21
+ dict(type='DefaultFormatBundle'),
22
+ dict(type='Collect', keys=['img', 'gt_bboxes', 'gt_labels', 'gt_masks']),
23
+ ]
24
+
25
+ # for ctw1500
26
+ img_scale_ctw1500 = (1600, 1600)
27
+ test_pipeline_ctw1500 = [
28
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
29
+ dict(
30
+ type='MultiScaleFlipAug',
31
+ img_scale=img_scale_ctw1500, # used by Resize
32
+ flip=False,
33
+ transforms=[
34
+ dict(type='Resize', keep_ratio=True),
35
+ dict(type='RandomFlip'),
36
+ dict(type='Normalize', **img_norm_cfg),
37
+ dict(type='ImageToTensor', keys=['img']),
38
+ dict(type='Collect', keys=['img']),
39
+ ])
40
+ ]
41
+
42
+ # for icdar2015
43
+ img_scale_icdar2015 = (1920, 1920)
44
+ test_pipeline_icdar2015 = [
45
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
46
+ dict(
47
+ type='MultiScaleFlipAug',
48
+ img_scale=img_scale_icdar2015, # used by Resize
49
+ flip=False,
50
+ transforms=[
51
+ dict(type='Resize', keep_ratio=True),
52
+ dict(type='RandomFlip'),
53
+ dict(type='Normalize', **img_norm_cfg),
54
+ dict(type='ImageToTensor', keys=['img']),
55
+ dict(type='Collect', keys=['img']),
56
+ ])
57
+ ]
configs/_base_/det_pipelines/panet_pipeline.py ADDED
@@ -0,0 +1,156 @@
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ # for ctw1500
5
+ img_scale_train_ctw1500 = [(3000, 640)]
6
+ shrink_ratio_train_ctw1500 = (1.0, 0.7)
7
+ target_size_train_ctw1500 = (640, 640)
8
+ train_pipeline_ctw1500 = [
9
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
10
+ dict(
11
+ type='LoadTextAnnotations',
12
+ with_bbox=True,
13
+ with_mask=True,
14
+ poly2mask=False),
15
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
16
+ dict(type='Normalize', **img_norm_cfg),
17
+ dict(
18
+ type='ScaleAspectJitter',
19
+ img_scale=img_scale_train_ctw1500,
20
+ ratio_range=(0.7, 1.3),
21
+ aspect_ratio_range=(0.9, 1.1),
22
+ multiscale_mode='value',
23
+ keep_ratio=False),
24
+ # shrink_ratio is from big to small. The 1st must be 1.0
25
+ dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_ctw1500),
26
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
27
+ dict(type='RandomRotateTextDet'),
28
+ dict(
29
+ type='RandomCropInstances',
30
+ target_size=target_size_train_ctw1500,
31
+ instance_key='gt_kernels'),
32
+ dict(type='Pad', size_divisor=32),
33
+ dict(
34
+ type='CustomFormatBundle',
35
+ keys=['gt_kernels', 'gt_mask'],
36
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
37
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
38
+ ]
39
+
40
+ img_scale_test_ctw1500 = (3000, 640)
41
+ test_pipeline_ctw1500 = [
42
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
43
+ dict(
44
+ type='MultiScaleFlipAug',
45
+ img_scale=img_scale_test_ctw1500, # used by Resize
46
+ flip=False,
47
+ transforms=[
48
+ dict(type='Resize', keep_ratio=True),
49
+ dict(type='Normalize', **img_norm_cfg),
50
+ dict(type='Pad', size_divisor=32),
51
+ dict(type='ImageToTensor', keys=['img']),
52
+ dict(type='Collect', keys=['img']),
53
+ ])
54
+ ]
55
+
56
+ # for icdar2015
57
+ img_scale_train_icdar2015 = [(3000, 736)]
58
+ shrink_ratio_train_icdar2015 = (1.0, 0.5)
59
+ target_size_train_icdar2015 = (736, 736)
60
+ train_pipeline_icdar2015 = [
61
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
62
+ dict(
63
+ type='LoadTextAnnotations',
64
+ with_bbox=True,
65
+ with_mask=True,
66
+ poly2mask=False),
67
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
68
+ dict(type='Normalize', **img_norm_cfg),
69
+ dict(
70
+ type='ScaleAspectJitter',
71
+ img_scale=img_scale_train_icdar2015,
72
+ ratio_range=(0.7, 1.3),
73
+ aspect_ratio_range=(0.9, 1.1),
74
+ multiscale_mode='value',
75
+ keep_ratio=False),
76
+ dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2015),
77
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
78
+ dict(type='RandomRotateTextDet'),
79
+ dict(
80
+ type='RandomCropInstances',
81
+ target_size=target_size_train_icdar2015,
82
+ instance_key='gt_kernels'),
83
+ dict(type='Pad', size_divisor=32),
84
+ dict(
85
+ type='CustomFormatBundle',
86
+ keys=['gt_kernels', 'gt_mask'],
87
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
88
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
89
+ ]
90
+
91
+ img_scale_test_icdar2015 = (1333, 736)
92
+ test_pipeline_icdar2015 = [
93
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
94
+ dict(
95
+ type='MultiScaleFlipAug',
96
+ img_scale=img_scale_test_icdar2015, # used by Resize
97
+ flip=False,
98
+ transforms=[
99
+ dict(type='Resize', keep_ratio=True),
100
+ dict(type='Normalize', **img_norm_cfg),
101
+ dict(type='Pad', size_divisor=32),
102
+ dict(type='ImageToTensor', keys=['img']),
103
+ dict(type='Collect', keys=['img']),
104
+ ])
105
+ ]
106
+
107
+ # for icdar2017
108
+ img_scale_train_icdar2017 = [(3000, 800)]
109
+ shrink_ratio_train_icdar2017 = (1.0, 0.5)
110
+ target_size_train_icdar2017 = (800, 800)
111
+ train_pipeline_icdar2017 = [
112
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
113
+ dict(
114
+ type='LoadTextAnnotations',
115
+ with_bbox=True,
116
+ with_mask=True,
117
+ poly2mask=False),
118
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
119
+ dict(type='Normalize', **img_norm_cfg),
120
+ dict(
121
+ type='ScaleAspectJitter',
122
+ img_scale=img_scale_train_icdar2017,
123
+ ratio_range=(0.7, 1.3),
124
+ aspect_ratio_range=(0.9, 1.1),
125
+ multiscale_mode='value',
126
+ keep_ratio=False),
127
+ dict(type='PANetTargets', shrink_ratio=shrink_ratio_train_icdar2017),
128
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
129
+ dict(type='RandomRotateTextDet'),
130
+ dict(
131
+ type='RandomCropInstances',
132
+ target_size=target_size_train_icdar2017,
133
+ instance_key='gt_kernels'),
134
+ dict(type='Pad', size_divisor=32),
135
+ dict(
136
+ type='CustomFormatBundle',
137
+ keys=['gt_kernels', 'gt_mask'],
138
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
139
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
140
+ ]
141
+
142
+ img_scale_test_icdar2017 = (1333, 800)
143
+ test_pipeline_icdar2017 = [
144
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
145
+ dict(
146
+ type='MultiScaleFlipAug',
147
+ img_scale=img_scale_test_icdar2017, # used by Resize
148
+ flip=False,
149
+ transforms=[
150
+ dict(type='Resize', keep_ratio=True),
151
+ dict(type='Normalize', **img_norm_cfg),
152
+ dict(type='Pad', size_divisor=32),
153
+ dict(type='ImageToTensor', keys=['img']),
154
+ dict(type='Collect', keys=['img']),
155
+ ])
156
+ ]
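
PANetTargets builds one shrunk "kernel" map per shrink_ratio entry by offsetting every polygon inward; PAN/PSENet derive the offset as d = Area(p) * (1 - r^2) / Perimeter(p). A sketch of a single shrink, assuming pyclipper is available; area and perimeter are computed with the shoelace formula:

    import numpy as np
    import pyclipper

    def shrink_polygon(poly: np.ndarray, ratio: float) -> list:
        """poly: (N, 2) points; returns the shrunk polygon(s) for one kernel ratio."""
        x, y = poly[:, 0], poly[:, 1]
        area = 0.5 * abs(np.dot(x, np.roll(y, 1)) - np.dot(y, np.roll(x, 1)))
        peri = np.sum(np.linalg.norm(poly - np.roll(poly, 1, axis=0), axis=1))
        d = area * (1.0 - ratio ** 2) / (peri + 1e-6)  # PSENet/PAN shrink offset
        pco = pyclipper.PyclipperOffset()
        pco.AddPath(poly.astype(int).tolist(), pyclipper.JT_ROUND, pyclipper.ET_CLOSEDPOLYGON)
        return pco.Execute(-d)
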
configs/_base_/det_pipelines/psenet_pipeline.py ADDED
@@ -0,0 +1,70 @@
1
+ img_norm_cfg = dict(
2
+ mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
3
+
4
+ train_pipeline = [
5
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
6
+ dict(
7
+ type='LoadTextAnnotations',
8
+ with_bbox=True,
9
+ with_mask=True,
10
+ poly2mask=False),
11
+ dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
12
+ dict(type='Normalize', **img_norm_cfg),
13
+ dict(
14
+ type='ScaleAspectJitter',
15
+ img_scale=[(3000, 736)],
16
+ ratio_range=(0.5, 3),
17
+ aspect_ratio_range=(1, 1),
18
+ multiscale_mode='value',
19
+ long_size_bound=1280,
20
+ short_size_bound=640,
21
+ resize_type='long_short_bound',
22
+ keep_ratio=False),
23
+ dict(type='PSENetTargets'),
24
+ dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
25
+ dict(type='RandomRotateTextDet'),
26
+ dict(
27
+ type='RandomCropInstances',
28
+ target_size=(640, 640),
29
+ instance_key='gt_kernels'),
30
+ dict(type='Pad', size_divisor=32),
31
+ dict(
32
+ type='CustomFormatBundle',
33
+ keys=['gt_kernels', 'gt_mask'],
34
+ visualize=dict(flag=False, boundary_key='gt_kernels')),
35
+ dict(type='Collect', keys=['img', 'gt_kernels', 'gt_mask'])
36
+ ]
37
+
38
+ # for ctw1500
39
+ img_scale_test_ctw1500 = (1280, 1280)
40
+ test_pipeline_ctw1500 = [
41
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
42
+ dict(
43
+ type='MultiScaleFlipAug',
44
+ img_scale=img_scale_test_ctw1500, # used by Resize
45
+ flip=False,
46
+ transforms=[
47
+ dict(type='Resize', keep_ratio=True),
48
+ dict(type='Normalize', **img_norm_cfg),
49
+ dict(type='Pad', size_divisor=32),
50
+ dict(type='ImageToTensor', keys=['img']),
51
+ dict(type='Collect', keys=['img']),
52
+ ])
53
+ ]
54
+
55
+ # for icdar2015
56
+ img_scale_test_icdar2015 = (2240, 2240)
57
+ test_pipeline_icdar2015 = [
58
+ dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
59
+ dict(
60
+ type='MultiScaleFlipAug',
61
+ img_scale=img_scale_test_icdar2015, # used by Resize
62
+ flip=False,
63
+ transforms=[
64
+ dict(type='Resize', keep_ratio=True),
65
+ dict(type='Normalize', **img_norm_cfg),
66
+ dict(type='Pad', size_divisor=32),
67
+ dict(type='ImageToTensor', keys=['img']),
68
+ dict(type='Collect', keys=['img']),
69
+ ])
70
+ ]
configs/_base_/det_pipelines/textsnake_pipeline.py ADDED
@@ -0,0 +1,65 @@
+img_norm_cfg = dict(
+    mean=[123.675, 116.28, 103.53], std=[58.395, 57.12, 57.375], to_rgb=True)
+
+train_pipeline = [
+    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+    dict(
+        type='LoadTextAnnotations',
+        with_bbox=True,
+        with_mask=True,
+        poly2mask=False),
+    dict(type='ColorJitter', brightness=32.0 / 255, saturation=0.5),
+    dict(type='Normalize', **img_norm_cfg),
+    dict(
+        type='RandomCropPolyInstances',
+        instance_key='gt_masks',
+        crop_ratio=0.65,
+        min_side_ratio=0.3),
+    dict(
+        type='RandomRotatePolyInstances',
+        rotate_ratio=0.5,
+        max_angle=20,
+        pad_with_fixed_color=False),
+    dict(
+        type='ScaleAspectJitter',
+        img_scale=[(3000, 736)],  # unused
+        ratio_range=(0.7, 1.3),
+        aspect_ratio_range=(0.9, 1.1),
+        multiscale_mode='value',
+        long_size_bound=800,
+        short_size_bound=480,
+        resize_type='long_short_bound',
+        keep_ratio=False),
+    dict(type='SquareResizePad', target_size=800, pad_ratio=0.6),
+    dict(type='RandomFlip', flip_ratio=0.5, direction='horizontal'),
+    dict(type='TextSnakeTargets'),
+    dict(type='Pad', size_divisor=32),
+    dict(
+        type='CustomFormatBundle',
+        keys=[
+            'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
+            'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
+        ],
+        visualize=dict(flag=False, boundary_key='gt_text_mask')),
+    dict(
+        type='Collect',
+        keys=[
+            'img', 'gt_text_mask', 'gt_center_region_mask', 'gt_mask',
+            'gt_radius_map', 'gt_sin_map', 'gt_cos_map'
+        ])
+]
+
+test_pipeline = [
+    dict(type='LoadImageFromFile', color_type='color_ignore_orientation'),
+    dict(
+        type='MultiScaleFlipAug',
+        img_scale=(1333, 736),  # used by Resize
+        flip=False,
+        transforms=[
+            dict(type='Resize', keep_ratio=True),
+            dict(type='Normalize', **img_norm_cfg),
+            dict(type='Pad', size_divisor=32),
+            dict(type='ImageToTensor', keys=['img']),
+            dict(type='Collect', keys=['img']),
+        ])
+]
configs/_base_/recog_datasets/MJ_train.py ADDED
@@ -0,0 +1,21 @@
+# Text Recognition Training set, including:
+# Synthetic Datasets: Syn90k
+
+train_root = 'data/mixture/Syn90k'
+
+train_img_prefix = f'{train_root}/mnt/ramdisk/max/90kDICT32px'
+train_ann_file = f'{train_root}/label.lmdb'
+
+train = dict(
+    type='OCRDataset',
+    img_prefix=train_img_prefix,
+    ann_file=train_ann_file,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='lmdb',
+        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+    pipeline=None,
+    test_mode=False)
+
+train_list = [train]
configs/_base_/recog_datasets/ST_MJ_alphanumeric_train.py ADDED
@@ -0,0 +1,31 @@
+# Text Recognition Training set, including:
+# Synthetic Datasets: SynthText, Syn90k
+# Both annotations are filtered so that
+# only alphanumeric terms are left
+
+train_root = 'data/mixture'
+
+train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
+train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
+
+train1 = dict(
+    type='OCRDataset',
+    img_prefix=train_img_prefix1,
+    ann_file=train_ann_file1,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='lmdb',
+        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+    pipeline=None,
+    test_mode=False)
+
+train_img_prefix2 = f'{train_root}/SynthText/' + \
+    'synthtext/SynthText_patch_horizontal'
+train_ann_file2 = f'{train_root}/SynthText/alphanumeric_label.lmdb'
+
+train2 = {key: value for key, value in train1.items()}
+train2['img_prefix'] = train_img_prefix2
+train2['ann_file'] = train_ann_file2
+
+train_list = [train1, train2]
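One thing worth noting about the copying idiom used here (and repeated in the other multi-source dataset files below): `{key: value for key, value in train1.items()}` is a shallow copy, so `train2` shares the nested `loader` dict with `train1`. That is harmless as long as only top-level keys such as `img_prefix` and `ann_file` are reassigned, but mutating a nested value would leak across both datasets. A small illustration of the difference, with `copy.deepcopy` as the defensive alternative:

import copy

train1 = dict(ann_file='a.lmdb', loader=dict(repeat=1))

# Shallow copy, as in the config: reassigning top-level keys is safe.
train2 = {key: value for key, value in train1.items()}
train2['ann_file'] = 'b.lmdb'       # train1 is unaffected
# train2['loader']['repeat'] = 5    # but this would also change train1

# Deep copy removes the shared nesting entirely.
train3 = copy.deepcopy(train1)
train3['loader']['repeat'] = 5      # train1 keeps repeat=1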
configs/_base_/recog_datasets/ST_MJ_train.py ADDED
@@ -0,0 +1,29 @@
+# Text Recognition Training set, including:
+# Synthetic Datasets: SynthText, Syn90k
+
+train_root = 'data/mixture'
+
+train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
+train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
+
+train1 = dict(
+    type='OCRDataset',
+    img_prefix=train_img_prefix1,
+    ann_file=train_ann_file1,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='lmdb',
+        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+    pipeline=None,
+    test_mode=False)
+
+train_img_prefix2 = f'{train_root}/SynthText/' + \
+    'synthtext/SynthText_patch_horizontal'
+train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
+
+train2 = {key: value for key, value in train1.items()}
+train2['img_prefix'] = train_img_prefix2
+train2['ann_file'] = train_ann_file2
+
+train_list = [train1, train2]
configs/_base_/recog_datasets/ST_SA_MJ_real_train.py ADDED
@@ -0,0 +1,81 @@
+# Text Recognition Training set, including:
+# Synthetic Datasets: SynthText, SynthAdd, Syn90k
+# Real Dataset: IC11, IC13, IC15, COCO-Text, IIIT5k
+
+train_prefix = 'data/mixture'
+
+train_img_prefix1 = f'{train_prefix}/icdar_2011'
+train_img_prefix2 = f'{train_prefix}/icdar_2013'
+train_img_prefix3 = f'{train_prefix}/icdar_2015'
+train_img_prefix4 = f'{train_prefix}/coco_text'
+train_img_prefix5 = f'{train_prefix}/IIIT5K'
+train_img_prefix6 = f'{train_prefix}/SynthText_Add'
+train_img_prefix7 = f'{train_prefix}/SynthText'
+train_img_prefix8 = f'{train_prefix}/Syn90k'
+
+train_ann_file1 = f'{train_prefix}/icdar_2011/train_label.txt'
+train_ann_file2 = f'{train_prefix}/icdar_2013/train_label.txt'
+train_ann_file3 = f'{train_prefix}/icdar_2015/train_label.txt'
+train_ann_file4 = f'{train_prefix}/coco_text/train_label.txt'
+train_ann_file5 = f'{train_prefix}/IIIT5K/train_label.txt'
+train_ann_file6 = f'{train_prefix}/SynthText_Add/label.txt'
+train_ann_file7 = f'{train_prefix}/SynthText/shuffle_labels.txt'
+train_ann_file8 = f'{train_prefix}/Syn90k/shuffle_labels.txt'
+
+train1 = dict(
+    type='OCRDataset',
+    img_prefix=train_img_prefix1,
+    ann_file=train_ann_file1,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=20,
+        file_format='txt',
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=None,
+    test_mode=False)
+
+train2 = {key: value for key, value in train1.items()}
+train2['img_prefix'] = train_img_prefix2
+train2['ann_file'] = train_ann_file2
+
+train3 = {key: value for key, value in train1.items()}
+train3['img_prefix'] = train_img_prefix3
+train3['ann_file'] = train_ann_file3
+
+train4 = {key: value for key, value in train1.items()}
+train4['img_prefix'] = train_img_prefix4
+train4['ann_file'] = train_ann_file4
+
+train5 = {key: value for key, value in train1.items()}
+train5['img_prefix'] = train_img_prefix5
+train5['ann_file'] = train_ann_file5
+
+train6 = dict(
+    type='OCRDataset',
+    img_prefix=train_img_prefix6,
+    ann_file=train_ann_file6,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='txt',
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=None,
+    test_mode=False)
+
+train7 = {key: value for key, value in train6.items()}
+train7['img_prefix'] = train_img_prefix7
+train7['ann_file'] = train_ann_file7
+
+train8 = {key: value for key, value in train6.items()}
+train8['img_prefix'] = train_img_prefix8
+train8['ann_file'] = train_ann_file8
+
+train_list = [train1, train2, train3, train4, train5, train6, train7, train8]
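Note the asymmetry in `repeat`: the five real datasets use `repeat=20` while the synthetic ones use `repeat=1`. With `AnnFileLoader`, `repeat` multiplies how many times the annotation file is traversed, so this oversamples the comparatively tiny real datasets when all eight sources are concatenated, which is the usual trick for balancing real against synthetic data. A back-of-the-envelope check of the resulting balance (the line counts below are purely hypothetical placeholders, not the real file sizes):

# Hypothetical annotation counts; the actual files differ in size.
real_lines = [3500, 850, 4500, 42000, 2000]       # IC11, IC13, IC15, COCO-Text, IIIT5K
synth_lines = [1_200_000, 7_000_000, 9_000_000]   # SynthAdd, SynthText, Syn90k

effective_real = sum(20 * n for n in real_lines)    # repeat=20
effective_synth = sum(1 * n for n in synth_lines)   # repeat=1
print(f'real: {effective_real}, synthetic: {effective_synth}')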
configs/_base_/recog_datasets/ST_SA_MJ_train.py ADDED
@@ -0,0 +1,48 @@
+# Text Recognition Training set, including:
+# Synthetic Datasets: SynthText, Syn90k
+
+train_root = 'data/mixture'
+
+train_img_prefix1 = f'{train_root}/Syn90k/mnt/ramdisk/max/90kDICT32px'
+train_ann_file1 = f'{train_root}/Syn90k/label.lmdb'
+
+train1 = dict(
+    type='OCRDataset',
+    img_prefix=train_img_prefix1,
+    ann_file=train_ann_file1,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='lmdb',
+        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+    pipeline=None,
+    test_mode=False)
+
+train_img_prefix2 = f'{train_root}/SynthText/' + \
+    'synthtext/SynthText_patch_horizontal'
+train_ann_file2 = f'{train_root}/SynthText/label.lmdb'
+
+train_img_prefix3 = f'{train_root}/SynthText_Add'
+train_ann_file3 = f'{train_root}/SynthText_Add/label.txt'
+
+train2 = {key: value for key, value in train1.items()}
+train2['img_prefix'] = train_img_prefix2
+train2['ann_file'] = train_ann_file2
+
+train3 = dict(
+    type='OCRDataset',
+    img_prefix=train_img_prefix3,
+    ann_file=train_ann_file3,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='txt',
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=None,
+    test_mode=False)
+
+train_list = [train1, train2, train3]
configs/_base_/recog_datasets/ST_charbox_train.py ADDED
@@ -0,0 +1,23 @@
+# Text Recognition Training set, including:
+# Synthetic Datasets: SynthText (with character level boxes)
+
+train_img_root = 'data/mixture'
+
+train_img_prefix = f'{train_img_root}/SynthText'
+
+train_ann_file = f'{train_img_root}/SynthText/instances_train.txt'
+
+train = dict(
+    type='OCRSegDataset',
+    img_prefix=train_img_prefix,
+    ann_file=train_ann_file,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='txt',
+        parser=dict(
+            type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
+    pipeline=None,
+    test_mode=False)
+
+train_list = [train]
configs/_base_/recog_datasets/academic_test.py ADDED
@@ -0,0 +1,57 @@
+# Text Recognition Testing set, including:
+# Regular Datasets: IIIT5K, SVT, IC13
+# Irregular Datasets: IC15, SVTP, CT80
+
+test_root = 'data/mixture'
+
+test_img_prefix1 = f'{test_root}/IIIT5K/'
+test_img_prefix2 = f'{test_root}/svt/'
+test_img_prefix3 = f'{test_root}/icdar_2013/'
+test_img_prefix4 = f'{test_root}/icdar_2015/'
+test_img_prefix5 = f'{test_root}/svtp/'
+test_img_prefix6 = f'{test_root}/ct80/'
+
+test_ann_file1 = f'{test_root}/IIIT5K/test_label.txt'
+test_ann_file2 = f'{test_root}/svt/test_label.txt'
+test_ann_file3 = f'{test_root}/icdar_2013/test_label_1015.txt'
+test_ann_file4 = f'{test_root}/icdar_2015/test_label.txt'
+test_ann_file5 = f'{test_root}/svtp/test_label.txt'
+test_ann_file6 = f'{test_root}/ct80/test_label.txt'
+
+test1 = dict(
+    type='OCRDataset',
+    img_prefix=test_img_prefix1,
+    ann_file=test_ann_file1,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='txt',
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=None,
+    test_mode=True)
+
+test2 = {key: value for key, value in test1.items()}
+test2['img_prefix'] = test_img_prefix2
+test2['ann_file'] = test_ann_file2
+
+test3 = {key: value for key, value in test1.items()}
+test3['img_prefix'] = test_img_prefix3
+test3['ann_file'] = test_ann_file3
+
+test4 = {key: value for key, value in test1.items()}
+test4['img_prefix'] = test_img_prefix4
+test4['ann_file'] = test_ann_file4
+
+test5 = {key: value for key, value in test1.items()}
+test5['img_prefix'] = test_img_prefix5
+test5['ann_file'] = test_ann_file5
+
+test6 = {key: value for key, value in test1.items()}
+test6['img_prefix'] = test_img_prefix6
+test6['ann_file'] = test_ann_file6
+
+test_list = [test1, test2, test3, test4, test5, test6]
configs/_base_/recog_datasets/seg_toy_data.py ADDED
@@ -0,0 +1,34 @@
+prefix = 'tests/data/ocr_char_ann_toy_dataset/'
+
+train = dict(
+    type='OCRSegDataset',
+    img_prefix=f'{prefix}/imgs',
+    ann_file=f'{prefix}/instances_train.txt',
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=100,
+        file_format='txt',
+        parser=dict(
+            type='LineJsonParser', keys=['file_name', 'annotations', 'text'])),
+    pipeline=None,
+    test_mode=True)
+
+test = dict(
+    type='OCRDataset',
+    img_prefix=f'{prefix}/imgs',
+    ann_file=f'{prefix}/instances_test.txt',
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='txt',
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=None,
+    test_mode=True)
+
+train_list = [train]
+
+test_list = [test]
configs/_base_/recog_datasets/toy_data.py ADDED
@@ -0,0 +1,54 @@
+dataset_type = 'OCRDataset'
+
+root = 'tests/data/ocr_toy_dataset'
+img_prefix = f'{root}/imgs'
+train_anno_file1 = f'{root}/label.txt'
+
+train1 = dict(
+    type=dataset_type,
+    img_prefix=img_prefix,
+    ann_file=train_anno_file1,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=100,
+        file_format='txt',
+        file_storage_backend='disk',
+        parser=dict(
+            type='LineStrParser',
+            keys=['filename', 'text'],
+            keys_idx=[0, 1],
+            separator=' ')),
+    pipeline=None,
+    test_mode=False)
+
+train_anno_file2 = f'{root}/label.lmdb'
+train2 = dict(
+    type=dataset_type,
+    img_prefix=img_prefix,
+    ann_file=train_anno_file2,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=100,
+        file_format='lmdb',
+        file_storage_backend='disk',
+        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+    pipeline=None,
+    test_mode=False)
+
+test_anno_file1 = f'{root}/label.lmdb'
+test = dict(
+    type=dataset_type,
+    img_prefix=img_prefix,
+    ann_file=test_anno_file1,
+    loader=dict(
+        type='AnnFileLoader',
+        repeat=1,
+        file_format='lmdb',
+        file_storage_backend='disk',
+        parser=dict(type='LineJsonParser', keys=['filename', 'text'])),
+    pipeline=None,
+    test_mode=True)
+
+train_list = [train1, train2]
+
+test_list = [test]
@@ -0,0 +1,70 @@
 
+# num_chars depends on the configuration of label_convertor. The actual
+# dictionary size is 36 + 1 (<BOS/EOS>).
+# TODO: Automatically update num_chars based on the configuration of
+# label_convertor
+num_chars = 37
+max_seq_len = 26
+
+label_convertor = dict(
+    type='ABIConvertor',
+    dict_type='DICT36',
+    with_unknown=False,
+    with_padding=False,
+    lower=True,
+)
+
+model = dict(
+    type='ABINet',
+    backbone=dict(type='ResNetABI'),
+    encoder=dict(
+        type='ABIVisionModel',
+        encoder=dict(
+            type='TransformerEncoder',
+            n_layers=3,
+            n_head=8,
+            d_model=512,
+            d_inner=2048,
+            dropout=0.1,
+            max_len=8 * 32,
+        ),
+        decoder=dict(
+            type='ABIVisionDecoder',
+            in_channels=512,
+            num_channels=64,
+            attn_height=8,
+            attn_width=32,
+            attn_mode='nearest',
+            use_result='feature',
+            num_chars=num_chars,
+            max_seq_len=max_seq_len,
+            init_cfg=dict(type='Xavier', layer='Conv2d')),
+    ),
+    decoder=dict(
+        type='ABILanguageDecoder',
+        d_model=512,
+        n_head=8,
+        d_inner=2048,
+        n_layers=4,
+        dropout=0.1,
+        detach_tokens=True,
+        use_self_attn=False,
+        pad_idx=num_chars - 1,
+        num_chars=num_chars,
+        max_seq_len=max_seq_len,
+        init_cfg=None),
+    fuser=dict(
+        type='ABIFuser',
+        d_model=512,
+        num_chars=num_chars,
+        init_cfg=None,
+        max_seq_len=max_seq_len,
+    ),
+    loss=dict(
+        type='ABILoss',
+        enc_weight=1.0,
+        dec_weight=1.0,
+        fusion_weight=1.0,
+        num_classes=num_chars),
+    label_convertor=label_convertor,
+    max_seq_len=max_seq_len,
+    iter_size=3)
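Two values in this config are coupled: `num_chars = 37` is the 36-symbol `DICT36` alphabet (ten digits plus twenty-six lowercase letters) plus one `<BOS/EOS>` token, as the header comment notes, and the vision encoder's `max_len=8 * 32` is sized to the decoder's `attn_height=8` by `attn_width=32` attention map. A small consistency check that could be appended to the config (purely illustrative, not part of the original file):

assert num_chars == 10 + 26 + 1  # DICT36 plus <BOS/EOS>
vision = model['encoder']        # the ABIVisionModel sub-dict
assert vision['encoder']['max_len'] == \
    vision['decoder']['attn_height'] * vision['decoder']['attn_width']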
configs/_base_/recog_models/crnn.py ADDED
@@ -0,0 +1,12 @@
+label_convertor = dict(
+    type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)
+
+model = dict(
+    type='CRNNNet',
+    preprocessor=None,
+    backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
+    encoder=None,
+    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
+    loss=dict(type='CTCLoss'),
+    label_convertor=label_convertor,
+    pretrained=None)
configs/_base_/recog_models/crnn_tps.py ADDED
@@ -0,0 +1,18 @@
+# model
+label_convertor = dict(
+    type='CTCConvertor', dict_type='DICT36', with_unknown=False, lower=True)
+
+model = dict(
+    type='CRNNNet',
+    preprocessor=dict(
+        type='TPSPreprocessor',
+        num_fiducial=20,
+        img_size=(32, 100),
+        rectified_img_size=(32, 100),
+        num_img_channel=1),
+    backbone=dict(type='VeryDeepVgg', leaky_relu=False, input_channels=1),
+    encoder=None,
+    decoder=dict(type='CRNNDecoder', in_channels=512, rnn_flag=True),
+    loss=dict(type='CTCLoss'),
+    label_convertor=label_convertor,
+    pretrained=None)
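Like the detection pipelines, these recognizer model and dataset bases are inert on their own; a runnable experiment config composes them via `_base_` inheritance. A minimal sketch, again assuming MMOCR 0.x conventions (the pipeline base path and batch sizes below are illustrative):

_base_ = [
    '../../_base_/default_runtime.py',
    '../../_base_/recog_models/crnn.py',
    '../../_base_/recog_datasets/MJ_train.py',
    '../../_base_/recog_datasets/academic_test.py',
    '../../_base_/recog_pipelines/crnn_pipeline.py',  # assumed pipeline base
]

train_list = {{_base_.train_list}}
test_list = {{_base_.test_list}}

train_pipeline = {{_base_.train_pipeline}}
test_pipeline = {{_base_.test_pipeline}}

data = dict(
    samples_per_gpu=64,  # illustrative
    workers_per_gpu=4,
    train=dict(
        type='UniformConcatDataset',
        datasets=train_list,
        pipeline=train_pipeline),
    val=dict(
        type='UniformConcatDataset',
        datasets=test_list,
        pipeline=test_pipeline),
    test=dict(
        type='UniformConcatDataset',
        datasets=test_list,
        pipeline=test_pipeline))

evaluation = dict(interval=1, metric='acc')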