napatswift committed on
Commit
fa55425
•
1 Parent(s): 6cb798d

Add text recognition model

Browse files
main.py CHANGED
@@ -3,8 +3,11 @@ import gradio as gr
3
  import cv2
4
  import sys
5
 
6
- ocr = MMOCR(det_config='model/config.py',
7
- det_ckpt='model/model.pth', device='cpu')
 
 
 
8
 
9
  def get_rec(points):
10
  xs = []
@@ -34,13 +37,13 @@ def predict(image_input, score_threshold):
34
  p0, p1 = get_rec([int(i) for i in polygon])
35
  draw_img = cv2.rectangle(draw_img, p0, p1, (255,0,0), 2)
36
 
37
- return draw_img
38
 
39
  def run():
40
  demo = gr.Interface(
41
  fn=predict,
42
  inputs=[gr.components.Image(), gr.Slider(0, 1, 0.8)],
43
- outputs=gr.components.Image(),
44
  )
45
 
46
  demo.launch(server_name="0.0.0.0", server_port=7860)
 
3
  import cv2
4
  import sys
5
 
6
+ ocr = MMOCR(det_config='model/det/config.py',
7
+ det_ckpt='model/det/model.pth',
8
+ recog_config='model/recog/config.py',
9
+ recog_ckpt='model/recog/model.pth',  # recognition weights (model/recog/model.pth is added by this commit), not the detector checkpoint
10
+ device='cpu')
11
 
12
  def get_rec(points):
13
  xs = []
 
37
  p0, p1 = get_rec([int(i) for i in polygon])
38
  draw_img = cv2.rectangle(draw_img, p0, p1, (255,0,0), 2)
39
 
40
+ return draw_img, output
41
 
42
  def run():
43
  demo = gr.Interface(
44
  fn=predict,
45
  inputs=[gr.components.Image(), gr.Slider(0, 1, 0.8)],
46
+ outputs=[gr.components.Image(), gr.JSON()],
47
  )
48
 
49
  demo.launch(server_name="0.0.0.0", server_port=7860)
model/{20230224_051330.log → det/20230224_051330.log} RENAMED
File without changes
model/{config.py → det/config.py} RENAMED
File without changes
model/{model.pth → det/model.pth} RENAMED
File without changes
model/recog/config.py ADDED
@@ -0,0 +1,298 @@
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
 
1
+ thvl_textrecog_data_root = 'data/recog/synTH'
2
+ thvl_textrecog_train = dict(
3
+ type='OCRDataset',
4
+ data_root='data/recog/synTH',
5
+ ann_file='textrecog_train.json',
6
+ pipeline=None)
7
+ thvl_textrecog_test = dict(
8
+ type='OCRDataset',
9
+ data_root='data/recog/synTH',
10
+ ann_file='textrecog_test.json',
11
+ test_mode=True,
12
+ pipeline=None)
13
+ default_scope = 'mmocr'
14
+ env_cfg = dict(
15
+ cudnn_benchmark=True,
16
+ mp_cfg=dict(mp_start_method='fork', opencv_num_threads=0),
17
+ dist_cfg=dict(backend='nccl'))
18
+ randomness = dict(seed=None)
19
+ default_hooks = dict(
20
+ timer=dict(type='IterTimerHook'),
21
+ logger=dict(type='LoggerHook', interval=100),
22
+ param_scheduler=dict(type='ParamSchedulerHook'),
23
+ checkpoint=dict(type='CheckpointHook', interval=1),
24
+ sampler_seed=dict(type='DistSamplerSeedHook'),
25
+ sync_buffer=dict(type='SyncBuffersHook'),
26
+ visualization=dict(
27
+ type='VisualizationHook',
28
+ interval=1,
29
+ enable=False,
30
+ show=False,
31
+ draw_gt=False,
32
+ draw_pred=False))
33
+ log_level = 'INFO'
34
+ log_processor = dict(type='LogProcessor', window_size=10, by_epoch=True)
35
+ load_from = None
36
+ resume = False
37
+ val_evaluator = dict(
38
+ type='MultiDatasetsEvaluator',
39
+ metrics=[
40
+ dict(
41
+ type='WordMetric',
42
+ mode=['exact', 'ignore_case', 'ignore_case_symbol']),
43
+ dict(type='CharMetric')
44
+ ],
45
+ dataset_prefixes=None)
46
+ test_evaluator = dict(
47
+ type='MultiDatasetsEvaluator',
48
+ metrics=[
49
+ dict(
50
+ type='WordMetric',
51
+ mode=['exact', 'ignore_case', 'ignore_case_symbol']),
52
+ dict(type='CharMetric')
53
+ ],
54
+ dataset_prefixes=None)
55
+ vis_backends = [dict(type='LocalVisBackend')]
56
+ visualizer = dict(
57
+ type='TextRecogLocalVisualizer',
58
+ name='visualizer',
59
+ vis_backends=[dict(type='LocalVisBackend')])
60
+ optim_wrapper = dict(
61
+ type='OptimWrapper', optimizer=dict(type='Adam', lr=0.0003))
62
+ train_cfg = dict(type='EpochBasedTrainLoop', max_epochs=50, val_interval=1)
63
+ val_cfg = dict(type='ValLoop')
64
+ test_cfg = dict(type='TestLoop')
65
+ param_scheduler = [dict(type='MultiStepLR', milestones=[3, 4], end=6)]
66
+ file_client_args = dict(backend='disk')
67
+ dictionary = dict(
68
+ type='Dictionary',
69
+ dict_file=
70
+ '/content/mmocr/configs/textrecog/nrtr/../../../dicts/thai_digits_symbols.txt',
71
+ with_padding=True,
72
+ with_unknown=True,
73
+ same_start_end=True,
74
+ with_start=True,
75
+ with_end=True)
76
+ model = dict(
77
+ type='NRTR',
78
+ backbone=dict(type='NRTRModalityTransform'),
79
+ encoder=dict(type='NRTREncoder', n_layers=12),
80
+ decoder=dict(
81
+ type='NRTRDecoder',
82
+ module_loss=dict(
83
+ type='CEModuleLoss', ignore_first_char=True, flatten=True),
84
+ postprocessor=dict(type='AttentionPostprocessor'),
85
+ dictionary=dict(
86
+ type='Dictionary',
87
+ dict_file=
88
+ '/content/mmocr/configs/textrecog/nrtr/../../../dicts/thai_digits_symbols.txt',
89
+ with_padding=True,
90
+ with_unknown=True,
91
+ same_start_end=True,
92
+ with_start=True,
93
+ with_end=True),
94
+ max_seq_len=30),
95
+ data_preprocessor=dict(
96
+ type='TextRecogDataPreprocessor',
97
+ mean=[123.675, 116.28, 103.53],
98
+ std=[58.395, 57.12, 57.375]))
99
+ train_pipeline = [
100
+ dict(
101
+ type='LoadImageFromFile',
102
+ file_client_args=dict(backend='disk'),
103
+ ignore_empty=True,
104
+ min_size=2),
105
+ dict(type='LoadOCRAnnotations', with_text=True),
106
+ dict(
107
+ type='RescaleToHeight',
108
+ height=32,
109
+ min_width=32,
110
+ max_width=160,
111
+ width_divisor=4),
112
+ dict(type='PadToWidth', width=160),
113
+ dict(
114
+ type='PackTextRecogInputs',
115
+ meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
116
+ ]
117
+ test_pipeline = [
118
+ dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
119
+ dict(
120
+ type='RescaleToHeight',
121
+ height=32,
122
+ min_width=32,
123
+ max_width=160,
124
+ width_divisor=16),
125
+ dict(type='PadToWidth', width=160),
126
+ dict(type='LoadOCRAnnotations', with_text=True),
127
+ dict(
128
+ type='PackTextRecogInputs',
129
+ meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
130
+ ]
131
+ train_list = [
132
+ dict(
133
+ type='OCRDataset',
134
+ data_root='data/recog/synTH',
135
+ ann_file='textrecog_train.json',
136
+ pipeline=None)
137
+ ]
138
+ test_list = [
139
+ dict(
140
+ type='OCRDataset',
141
+ data_root='data/recog/synTH',
142
+ ann_file='textrecog_test.json',
143
+ test_mode=True,
144
+ pipeline=None)
145
+ ]
146
+ train_dataset = dict(
147
+ type='ConcatDataset',
148
+ datasets=[
149
+ dict(
150
+ type='OCRDataset',
151
+ data_root='data/recog/synTH',
152
+ ann_file='textrecog_train.json',
153
+ pipeline=None)
154
+ ],
155
+ pipeline=[
156
+ dict(
157
+ type='LoadImageFromFile',
158
+ file_client_args=dict(backend='disk'),
159
+ ignore_empty=True,
160
+ min_size=2),
161
+ dict(type='LoadOCRAnnotations', with_text=True),
162
+ dict(
163
+ type='RescaleToHeight',
164
+ height=32,
165
+ min_width=32,
166
+ max_width=160,
167
+ width_divisor=4),
168
+ dict(type='PadToWidth', width=160),
169
+ dict(
170
+ type='PackTextRecogInputs',
171
+ meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
172
+ ])
173
+ test_dataset = dict(
174
+ type='ConcatDataset',
175
+ datasets=[
176
+ dict(
177
+ type='OCRDataset',
178
+ data_root='data/recog/synTH',
179
+ ann_file='textrecog_test.json',
180
+ test_mode=True,
181
+ pipeline=None)
182
+ ],
183
+ pipeline=[
184
+ dict(type='LoadImageFromFile', file_client_args=dict(backend='disk')),
185
+ dict(
186
+ type='RescaleToHeight',
187
+ height=32,
188
+ min_width=32,
189
+ max_width=160,
190
+ width_divisor=16),
191
+ dict(type='PadToWidth', width=160),
192
+ dict(type='LoadOCRAnnotations', with_text=True),
193
+ dict(
194
+ type='PackTextRecogInputs',
195
+ meta_keys=('img_path', 'ori_shape', 'img_shape', 'valid_ratio'))
196
+ ])
197
+ train_dataloader = dict(
198
+ batch_size=384,
199
+ num_workers=24,
200
+ persistent_workers=True,
201
+ sampler=dict(type='DefaultSampler', shuffle=True),
202
+ dataset=dict(
203
+ type='ConcatDataset',
204
+ datasets=[
205
+ dict(
206
+ type='OCRDataset',
207
+ data_root='data/recog/synTH',
208
+ ann_file='textrecog_train.json',
209
+ pipeline=None)
210
+ ],
211
+ pipeline=[
212
+ dict(
213
+ type='LoadImageFromFile',
214
+ file_client_args=dict(backend='disk'),
215
+ ignore_empty=True,
216
+ min_size=2),
217
+ dict(type='LoadOCRAnnotations', with_text=True),
218
+ dict(
219
+ type='RescaleToHeight',
220
+ height=32,
221
+ min_width=32,
222
+ max_width=160,
223
+ width_divisor=4),
224
+ dict(type='PadToWidth', width=160),
225
+ dict(
226
+ type='PackTextRecogInputs',
227
+ meta_keys=('img_path', 'ori_shape', 'img_shape',
228
+ 'valid_ratio'))
229
+ ]))
230
+ test_dataloader = dict(
231
+ batch_size=1,
232
+ num_workers=4,
233
+ persistent_workers=True,
234
+ drop_last=False,
235
+ sampler=dict(type='DefaultSampler', shuffle=False),
236
+ dataset=dict(
237
+ type='ConcatDataset',
238
+ datasets=[
239
+ dict(
240
+ type='OCRDataset',
241
+ data_root='data/recog/synTH',
242
+ ann_file='textrecog_test.json',
243
+ test_mode=True,
244
+ pipeline=None)
245
+ ],
246
+ pipeline=[
247
+ dict(
248
+ type='LoadImageFromFile',
249
+ file_client_args=dict(backend='disk')),
250
+ dict(
251
+ type='RescaleToHeight',
252
+ height=32,
253
+ min_width=32,
254
+ max_width=160,
255
+ width_divisor=16),
256
+ dict(type='PadToWidth', width=160),
257
+ dict(type='LoadOCRAnnotations', with_text=True),
258
+ dict(
259
+ type='PackTextRecogInputs',
260
+ meta_keys=('img_path', 'ori_shape', 'img_shape',
261
+ 'valid_ratio'))
262
+ ]))
263
+ val_dataloader = dict(
264
+ batch_size=1,
265
+ num_workers=4,
266
+ persistent_workers=True,
267
+ drop_last=False,
268
+ sampler=dict(type='DefaultSampler', shuffle=False),
269
+ dataset=dict(
270
+ type='ConcatDataset',
271
+ datasets=[
272
+ dict(
273
+ type='OCRDataset',
274
+ data_root='data/recog/synTH',
275
+ ann_file='textrecog_test.json',
276
+ test_mode=True,
277
+ pipeline=None)
278
+ ],
279
+ pipeline=[
280
+ dict(
281
+ type='LoadImageFromFile',
282
+ file_client_args=dict(backend='disk')),
283
+ dict(
284
+ type='RescaleToHeight',
285
+ height=32,
286
+ min_width=32,
287
+ max_width=160,
288
+ width_divisor=16),
289
+ dict(type='PadToWidth', width=160),
290
+ dict(type='LoadOCRAnnotations', with_text=True),
291
+ dict(
292
+ type='PackTextRecogInputs',
293
+ meta_keys=('img_path', 'ori_shape', 'img_shape',
294
+ 'valid_ratio'))
295
+ ]))
296
+ auto_scale_lr = dict(base_batch_size=384)
297
+ launcher = 'none'
298
+ work_dir = './work_dirs/nrtr_modality-transform_50e_thvl'
model/recog/model.pth ADDED
@@ -0,0 +1,3 @@
 
 
 
 
1
+ version https://git-lfs.github.com/spec/v1
2
+ oid sha256:5b8611b397bd03159632e0e2b63f30f0f0ce0089107255fdf934dc94a063f59b
3
+ size 365107614